Diffstat (limited to 'lib/Target/AArch64')
79 files changed, 11776 insertions, 5404 deletions
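A recurring theme in the AArch64.td and pass changes below is replacing hard-coded CPU checks with per-feature subtarget flags: new SubtargetFeature definitions (FeatureBalanceFPOps, FeatureMergeNarrowLd, FeaturePostRAScheduler, and so on) are attached to the per-processor definitions, and passes such as AArch64A57FPLoadBalancing now query the generated accessor (balanceFPOps()) instead of testing isCortexA53()/isCortexA57(). A minimal sketch of that pattern follows; it is an editorial illustration rather than part of the patch, and the helper name shouldBalanceFPOps is invented for the example.

// Editorial sketch, not part of the patch: each SubtargetFeature declared in
// AArch64.td becomes a boolean member of AArch64Subtarget with a generated
// getter, so per-CPU tuning is a property query rather than a list of cores.
// balanceFPOps() is the accessor the diff itself uses; shouldBalanceFPOps()
// is a hypothetical wrapper added only for illustration.
#include "AArch64Subtarget.h"
#include "llvm/CodeGen/MachineFunction.h"

static bool shouldBalanceFPOps(const llvm::MachineFunction &MF) {
  // Query the feature bit from the function's subtarget.
  const auto &ST = MF.getSubtarget<llvm::AArch64Subtarget>();
  return ST.balanceFPOps();
}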
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h index 21106c9ad29a6..c767c75fce573 100644 --- a/lib/Target/AArch64/AArch64.h +++ b/lib/Target/AArch64/AArch64.h @@ -27,6 +27,7 @@ class FunctionPass; class MachineFunctionPass; FunctionPass *createAArch64DeadRegisterDefinitions(); +FunctionPass *createAArch64RedundantCopyEliminationPass(); FunctionPass *createAArch64ConditionalCompares(); FunctionPass *createAArch64AdvSIMDScalar(); FunctionPass *createAArch64BranchRelaxation(); @@ -44,6 +45,8 @@ FunctionPass *createAArch64A53Fix835769(); FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); FunctionPass *createAArch64CollectLOHPass(); + +void initializeAArch64ExpandPseudoPass(PassRegistry&); } // end namespace llvm #endif diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index cd3e84d38fe2f..b1e881685b0c6 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -11,7 +11,7 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// Target-independent interfaces which we are implementing +// Target-independent interfaces which we are implementing. //===----------------------------------------------------------------------===// include "llvm/Target/Target.td" @@ -32,6 +32,9 @@ def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", "Enable ARMv8 CRC-32 checksum instructions">; +def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true", + "Enable ARMv8 Reliability, Availability and Serviceability Extensions">; + def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", "Enable ARMv8 PMUv3 Performance Monitors extension">; @@ -58,6 +61,50 @@ def FeatureReserveX18 : SubtargetFeature<"reserve-x18", "ReserveX18", "true", "Reserve X18, making it unavailable " "as a GPR">; +def FeatureMergeNarrowLd : SubtargetFeature<"merge-narrow-ld", + "MergeNarrowLoads", "true", + "Merge narrow load instructions">; + +def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true", + "Use alias analysis during codegen">; + +def FeatureBalanceFPOps : SubtargetFeature<"balance-fp-ops", "BalanceFPOps", + "true", + "balance mix of odd and even D-registers for fp multiply(-accumulate) ops">; + +def FeaturePredictableSelectIsExpensive : SubtargetFeature< + "predictable-select-expensive", "PredictableSelectIsExpensive", "true", + "Prefer likely predicted branches over selects">; + +def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move", + "CustomAsCheapAsMove", "true", + "Use custom code for TargetInstrInfo::isAsCheapAsAMove()">; + +def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler", + "UsePostRAScheduler", "true", "Schedule again after register allocation">; + +def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store", + "Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">; + +def FeatureAvoidQuadLdStPairs : SubtargetFeature<"no-quad-ldst-pairs", + "AvoidQuadLdStPairs", "true", + "Do not form quad load/store pair operations">; + +def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature< + "alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern", + "true", "Use alternative pattern for sextload convert to f32">; + +def FeatureMacroOpFusion : SubtargetFeature< + "macroop-fusion", "HasMacroOpFusion", "true", + "CPU supports macro op fusion">; + +def 
FeatureDisableLatencySchedHeuristic : SubtargetFeature< + "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true", + "Disable latency scheduling heuristic">; + +def FeatureUseRSqrt : SubtargetFeature< + "use-reverse-square-root", "UseRSqrt", "true", "Use reverse square root">; + //===----------------------------------------------------------------------===// // Architectures. // @@ -66,7 +113,7 @@ def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", "Support ARM v8.1a instructions", [FeatureCRC]>; def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", - "Support ARM v8.2a instructions", [HasV8_1aOps]>; + "Support ARM v8.2a instructions", [HasV8_1aOps, FeatureRAS]>; //===----------------------------------------------------------------------===// // Register File Description @@ -85,67 +132,145 @@ include "AArch64InstrInfo.td" def AArch64InstrInfo : InstrInfo; //===----------------------------------------------------------------------===// +// Named operands for MRS/MSR/TLBI/... +//===----------------------------------------------------------------------===// + +include "AArch64SystemOperands.td" + +//===----------------------------------------------------------------------===// // AArch64 Processors supported. // include "AArch64SchedA53.td" include "AArch64SchedA57.td" include "AArch64SchedCyclone.td" include "AArch64SchedM1.td" +include "AArch64SchedKryo.td" +include "AArch64SchedVulcan.td" def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", - "Cortex-A35 ARM processors", - [FeatureFPARMv8, - FeatureNEON, - FeatureCrypto, + "Cortex-A35 ARM processors", [ FeatureCRC, - FeaturePerfMon]>; + FeatureCrypto, + FeatureFPARMv8, + FeatureNEON, + FeaturePerfMon + ]>; def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", - "Cortex-A53 ARM processors", - [FeatureFPARMv8, - FeatureNEON, - FeatureCrypto, + "Cortex-A53 ARM processors", [ + FeatureBalanceFPOps, FeatureCRC, - FeaturePerfMon]>; + FeatureCrypto, + FeatureCustomCheapAsMoveHandling, + FeatureFPARMv8, + FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler, + FeatureUseAA + ]>; def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", - "Cortex-A57 ARM processors", - [FeatureFPARMv8, + "Cortex-A57 ARM processors", [ + FeatureBalanceFPOps, + FeatureCRC, + FeatureCrypto, + FeatureCustomCheapAsMoveHandling, + FeatureFPARMv8, + FeatureMergeNarrowLd, FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive + ]>; + +def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", + "Cortex-A72 ARM processors", [ + FeatureCRC, FeatureCrypto, + FeatureFPARMv8, + FeatureNEON, + FeaturePerfMon + ]>; + +def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73", + "Cortex-A73 ARM processors", [ FeatureCRC, - FeaturePerfMon]>; + FeatureCrypto, + FeatureFPARMv8, + FeatureNEON, + FeaturePerfMon + ]>; def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", - "Cyclone", - [FeatureFPARMv8, - FeatureNEON, + "Cyclone", [ + FeatureAlternateSExtLoadCVTF32Pattern, FeatureCrypto, - FeatureCRC, + FeatureDisableLatencySchedHeuristic, + FeatureFPARMv8, + FeatureMacroOpFusion, + FeatureNEON, FeaturePerfMon, - FeatureZCRegMove, FeatureZCZeroing]>; + FeatureSlowMisaligned128Store, + FeatureZCRegMove, + FeatureZCZeroing + ]>; def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1", - "Samsung Exynos-M1 processors", - [FeatureFPARMv8, - FeatureNEON, - FeatureCrypto, + "Samsung Exynos-M1 processors", [ + 
FeatureAvoidQuadLdStPairs, FeatureCRC, - FeaturePerfMon]>; + FeatureCrypto, + FeatureCustomCheapAsMoveHandling, + FeatureFPARMv8, + FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler, + FeatureUseRSqrt + ]>; + +def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo", + "Qualcomm Kryo processors", [ + FeatureCRC, + FeatureCrypto, + FeatureCustomCheapAsMoveHandling, + FeatureFPARMv8, + FeatureMergeNarrowLd, + FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, + FeatureZCZeroing + ]>; + +def ProcVulcan : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan", + "Broadcom Vulcan processors", [ + FeatureCRC, + FeatureCrypto, + FeatureFPARMv8, + FeatureMacroOpFusion, + FeatureNEON, + FeaturePostRAScheduler, + HasV8_1aOps]>; -def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, - FeatureNEON, - FeatureCRC, - FeaturePerfMon]>; +def : ProcessorModel<"generic", NoSchedModel, [ + FeatureCRC, + FeatureFPARMv8, + FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler + ]>; // FIXME: Cortex-A35 is currently modelled as a Cortex-A53 def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>; def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; -// FIXME: Cortex-A72 is currently modelled as an Cortex-A57. -def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA57]>; +// FIXME: Cortex-A72 and Cortex-A73 are currently modelled as an Cortex-A57. +def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>; +def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>; def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>; +def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>; +def : ProcessorModel<"vulcan", VulcanModel, [ProcVulcan]>; //===----------------------------------------------------------------------===// // Assembly parser diff --git a/lib/Target/AArch64/AArch64A53Fix835769.cpp b/lib/Target/AArch64/AArch64A53Fix835769.cpp index d215d9e831c06..c2cca63f49774 100644 --- a/lib/Target/AArch64/AArch64A53Fix835769.cpp +++ b/lib/Target/AArch64/AArch64A53Fix835769.cpp @@ -22,7 +22,6 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" @@ -87,6 +86,11 @@ public: bool runOnMachineFunction(MachineFunction &F) override; + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::AllVRegsAllocated); + } + const char *getPassName() const override { return "Workaround A53 erratum 835769 pass"; } @@ -133,8 +137,8 @@ static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock *MBB, MachineBasicBlock *PrevBB = &*std::prev(MBBI); for (MachineBasicBlock *S : MBB->predecessors()) - if (S == PrevBB && !TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond) && - !TBB && !FBB) + if (S == PrevBB && !TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) && !TBB && + !FBB) return S; return nullptr; diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp index 3d1ab4e3fc2b6..0465e59dc54a6 100644 --- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp +++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -43,7 +43,6 @@ #include "llvm/Support/CommandLine.h" 
#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include <list> using namespace llvm; #define DEBUG_TYPE "aarch64-a57-fp-load-balancing" @@ -125,6 +124,11 @@ public: bool runOnMachineFunction(MachineFunction &F) override; + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::AllVRegsAllocated); + } + const char *getPassName() const override { return "A57 FP Anti-dependency breaker"; } @@ -222,7 +226,7 @@ public: } /// Return true if MI is a member of the chain. - bool contains(MachineInstr *MI) { return Insts.count(MI) > 0; } + bool contains(MachineInstr &MI) { return Insts.count(&MI) > 0; } /// Return the number of instructions in the chain. unsigned size() const { @@ -248,9 +252,10 @@ public: MachineInstr *getKill() const { return KillInst; } /// Return an instruction that can be used as an iterator for the end /// of the chain. This is the maximum of KillInst (if set) and LastInst. - MachineBasicBlock::iterator getEnd() const { + MachineBasicBlock::iterator end() const { return ++MachineBasicBlock::iterator(KillInst ? KillInst : LastInst); } + MachineBasicBlock::iterator begin() const { return getStart(); } /// Can the Kill instruction (assuming one exists) be modified? bool isKillImmutable() const { return KillIsImmutable; } @@ -307,9 +312,10 @@ public: //===----------------------------------------------------------------------===// bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) { - // Don't do anything if this isn't an A53 or A57. - if (!(F.getSubtarget<AArch64Subtarget>().isCortexA53() || - F.getSubtarget<AArch64Subtarget>().isCortexA57())) + if (skipFunction(*F.getFunction())) + return false; + + if (!F.getSubtarget<AArch64Subtarget>().balanceFPOps()) return false; bool Changed = false; @@ -492,15 +498,14 @@ bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV, int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C, MachineBasicBlock &MBB) { RegScavenger RS; - RS.enterBasicBlock(&MBB); + RS.enterBasicBlock(MBB); RS.forward(MachineBasicBlock::iterator(G->getStart())); // Can we find an appropriate register that is available throughout the life // of the chain? unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass; BitVector AvailableRegs = RS.getRegsAvailable(TRI->getRegClass(RegClassID)); - for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd(); - I != E; ++I) { + for (MachineBasicBlock::iterator I = G->begin(), E = G->end(); I != E; ++I) { RS.forward(I); AvailableRegs &= RS.getRegsAvailable(TRI->getRegClass(RegClassID)); @@ -530,8 +535,7 @@ int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C, for (auto Reg : Ord) { if (!AvailableRegs[Reg]) continue; - if ((C == Color::Even && (Reg % 2) == 0) || - (C == Color::Odd && (Reg % 2) == 1)) + if (C == getColor(Reg)) return Reg; } @@ -554,16 +558,14 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C, DEBUG(dbgs() << " - Scavenged register: " << TRI->getName(Reg) << "\n"); std::map<unsigned, unsigned> Substs; - for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd(); - I != E; ++I) { - if (!G->contains(I) && - (&*I != G->getKill() || G->isKillImmutable())) + for (MachineInstr &I : *G) { + if (!G->contains(I) && (&I != G->getKill() || G->isKillImmutable())) continue; // I is a member of G, or I is a mutable instruction that kills G. 
std::vector<unsigned> ToErase; - for (auto &U : I->operands()) { + for (auto &U : I.operands()) { if (U.isReg() && U.isUse() && Substs.find(U.getReg()) != Substs.end()) { unsigned OrigReg = U.getReg(); U.setReg(Substs[OrigReg]); @@ -583,11 +585,11 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C, Substs.erase(J); // Only change the def if this isn't the last instruction. - if (&*I != G->getKill()) { - MachineOperand &MO = I->getOperand(0); + if (&I != G->getKill()) { + MachineOperand &MO = I.getOperand(0); bool Change = TransformAll || getColor(MO.getReg()) != C; - if (G->requiresFixup() && &*I == G->getLast()) + if (G->requiresFixup() && &I == G->getLast()) Change = false; if (Change) { diff --git a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp index 3afcdfb8b930d..4846ef08c983c 100644 --- a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp +++ b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp @@ -20,10 +20,9 @@ // e = getelementptr ..., i64 a // // This is legal to do if the computations are marked with either nsw or nuw -// markers. -// Moreover, the current heuristic is simple: it does not create new sext -// operations, i.e., it gives up when a sext would have forked (e.g., if -// a = add i32 b, c, two sexts are required to promote the computation). +// markers. Moreover, the current heuristic is simple: it does not create new +// sext operations, i.e., it gives up when a sext would have forked (e.g., if a +// = add i32 b, c, two sexts are required to promote the computation). // // FIXME: This pass may be useful for other targets too. // ===---------------------------------------------------------------------===// @@ -207,9 +206,7 @@ bool AArch64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) { } static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) { - if (isa<SelectInst>(Inst) && OpIdx == 0) - return false; - return true; + return !(isa<SelectInst>(Inst) && OpIdx == 0); } bool @@ -481,6 +478,9 @@ void AArch64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) { } bool AArch64AddressTypePromotion::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + if (!EnableAddressTypePromotion || F.isDeclaration()) return false; Func = &F; diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp index 1644d71d2821d..d0a2dd3fa1fc0 100644 --- a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp +++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp @@ -76,12 +76,12 @@ private: // isProfitableToTransform - Predicate function to determine whether an // instruction should be transformed to its equivalent AdvSIMD scalar // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example. - bool isProfitableToTransform(const MachineInstr *MI) const; + bool isProfitableToTransform(const MachineInstr &MI) const; // transformInstruction - Perform the transformation of an instruction // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs // to be the correct register class, minimizing cross-class copies. - void transformInstruction(MachineInstr *MI); + void transformInstruction(MachineInstr &MI); // processMachineBasicBlock - Main optimzation loop. bool processMachineBasicBlock(MachineBasicBlock *MBB); @@ -132,19 +132,19 @@ static bool isFPR64(unsigned Reg, unsigned SubReg, // getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64 // copy instruction. 
Return zero_reg if the instruction is not a copy. -static unsigned getSrcFromCopy(const MachineInstr *MI, - const MachineRegisterInfo *MRI, - unsigned &SubReg) { +static MachineOperand *getSrcFromCopy(MachineInstr *MI, + const MachineRegisterInfo *MRI, + unsigned &SubReg) { SubReg = 0; // The "FMOV Xd, Dn" instruction is the typical form. if (MI->getOpcode() == AArch64::FMOVDXr || MI->getOpcode() == AArch64::FMOVXDr) - return MI->getOperand(1).getReg(); + return &MI->getOperand(1); // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see // these at this stage, but it's easy to check for. if (MI->getOpcode() == AArch64::UMOVvi64 && MI->getOperand(2).getImm() == 0) { SubReg = AArch64::dsub; - return MI->getOperand(1).getReg(); + return &MI->getOperand(1); } // Or just a plain COPY instruction. This can be directly to/from FPR64, // or it can be a dsub subreg reference to an FPR128. @@ -152,18 +152,18 @@ static unsigned getSrcFromCopy(const MachineInstr *MI, if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), MRI) && isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI)) - return MI->getOperand(1).getReg(); + return &MI->getOperand(1); if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), MRI) && isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI)) { SubReg = MI->getOperand(1).getSubReg(); - return MI->getOperand(1).getReg(); + return &MI->getOperand(1); } } // Otherwise, this is some other kind of instruction. - return 0; + return nullptr; } // getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent @@ -189,16 +189,16 @@ static unsigned getTransformOpcode(unsigned Opc) { return Opc; } -static bool isTransformable(const MachineInstr *MI) { - unsigned Opc = MI->getOpcode(); +static bool isTransformable(const MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); return Opc != getTransformOpcode(Opc); } // isProfitableToTransform - Predicate function to determine whether an // instruction should be transformed to its equivalent AdvSIMD scalar // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example. -bool -AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { +bool AArch64AdvSIMDScalar::isProfitableToTransform( + const MachineInstr &MI) const { // If this instruction isn't eligible to be transformed (no SIMD equivalent), // early exit since that's the common case. if (!isTransformable(MI)) @@ -209,33 +209,33 @@ AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { unsigned NumNewCopies = 3; unsigned NumRemovableCopies = 0; - unsigned OrigSrc0 = MI->getOperand(1).getReg(); - unsigned OrigSrc1 = MI->getOperand(2).getReg(); - unsigned Src0 = 0, SubReg0; - unsigned Src1 = 0, SubReg1; + unsigned OrigSrc0 = MI.getOperand(1).getReg(); + unsigned OrigSrc1 = MI.getOperand(2).getReg(); + unsigned SubReg0; + unsigned SubReg1; if (!MRI->def_empty(OrigSrc0)) { MachineRegisterInfo::def_instr_iterator Def = MRI->def_instr_begin(OrigSrc0); assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); - Src0 = getSrcFromCopy(&*Def, MRI, SubReg0); + MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0); // If the source was from a copy, we don't need to insert a new copy. - if (Src0) + if (MOSrc0) --NumNewCopies; // If there are no other users of the original source, we can delete // that instruction. 
- if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) + if (MOSrc0 && MRI->hasOneNonDBGUse(OrigSrc0)) ++NumRemovableCopies; } if (!MRI->def_empty(OrigSrc1)) { MachineRegisterInfo::def_instr_iterator Def = MRI->def_instr_begin(OrigSrc1); assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); - Src1 = getSrcFromCopy(&*Def, MRI, SubReg1); - if (Src1) + MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1); + if (MOSrc1) --NumNewCopies; // If there are no other users of the original source, we can delete // that instruction. - if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) + if (MOSrc1 && MRI->hasOneNonDBGUse(OrigSrc1)) ++NumRemovableCopies; } @@ -244,14 +244,14 @@ AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { // any of the uses is a transformable instruction, it's likely the tranforms // will chain, enabling us to save a copy there, too. This is an aggressive // heuristic that approximates the graph based cost analysis described above. - unsigned Dst = MI->getOperand(0).getReg(); + unsigned Dst = MI.getOperand(0).getReg(); bool AllUsesAreCopies = true; for (MachineRegisterInfo::use_instr_nodbg_iterator Use = MRI->use_instr_nodbg_begin(Dst), E = MRI->use_instr_nodbg_end(); Use != E; ++Use) { unsigned SubReg; - if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(&*Use)) + if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(*Use)) ++NumRemovableCopies; // If the use is an INSERT_SUBREG, that's still something that can // directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's @@ -279,12 +279,11 @@ AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { return TransformAll; } -static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr *MI, +static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr &MI, unsigned Dst, unsigned Src, bool IsKill) { - MachineInstrBuilder MIB = - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AArch64::COPY), - Dst) - .addReg(Src, getKillRegState(IsKill)); + MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII->get(AArch64::COPY), Dst) + .addReg(Src, getKillRegState(IsKill)); DEBUG(dbgs() << " adding copy: " << *MIB); ++NumCopiesInserted; return MIB; @@ -293,43 +292,56 @@ static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr *MI, // transformInstruction - Perform the transformation of an instruction // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs // to be the correct register class, minimizing cross-class copies. -void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) { - DEBUG(dbgs() << "Scalar transform: " << *MI); +void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) { + DEBUG(dbgs() << "Scalar transform: " << MI); - MachineBasicBlock *MBB = MI->getParent(); - unsigned OldOpc = MI->getOpcode(); + MachineBasicBlock *MBB = MI.getParent(); + unsigned OldOpc = MI.getOpcode(); unsigned NewOpc = getTransformOpcode(OldOpc); assert(OldOpc != NewOpc && "transform an instruction to itself?!"); // Check if we need a copy for the source registers. 
- unsigned OrigSrc0 = MI->getOperand(1).getReg(); - unsigned OrigSrc1 = MI->getOperand(2).getReg(); + unsigned OrigSrc0 = MI.getOperand(1).getReg(); + unsigned OrigSrc1 = MI.getOperand(2).getReg(); unsigned Src0 = 0, SubReg0; unsigned Src1 = 0, SubReg1; + bool KillSrc0 = false, KillSrc1 = false; if (!MRI->def_empty(OrigSrc0)) { MachineRegisterInfo::def_instr_iterator Def = MRI->def_instr_begin(OrigSrc0); assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); - Src0 = getSrcFromCopy(&*Def, MRI, SubReg0); + MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0); // If there are no other users of the original source, we can delete // that instruction. - if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) { - assert(Src0 && "Can't delete copy w/o a valid original source!"); - Def->eraseFromParent(); - ++NumCopiesDeleted; + if (MOSrc0) { + Src0 = MOSrc0->getReg(); + KillSrc0 = MOSrc0->isKill(); + // Src0 is going to be reused, thus, it cannot be killed anymore. + MOSrc0->setIsKill(false); + if (MRI->hasOneNonDBGUse(OrigSrc0)) { + assert(MOSrc0 && "Can't delete copy w/o a valid original source!"); + Def->eraseFromParent(); + ++NumCopiesDeleted; + } } } if (!MRI->def_empty(OrigSrc1)) { MachineRegisterInfo::def_instr_iterator Def = MRI->def_instr_begin(OrigSrc1); assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); - Src1 = getSrcFromCopy(&*Def, MRI, SubReg1); + MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1); // If there are no other users of the original source, we can delete // that instruction. - if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) { - assert(Src1 && "Can't delete copy w/o a valid original source!"); - Def->eraseFromParent(); - ++NumCopiesDeleted; + if (MOSrc1) { + Src1 = MOSrc1->getReg(); + KillSrc1 = MOSrc1->isKill(); + // Src0 is going to be reused, thus, it cannot be killed anymore. + MOSrc1->setIsKill(false); + if (MRI->hasOneNonDBGUse(OrigSrc1)) { + assert(MOSrc1 && "Can't delete copy w/o a valid original source!"); + Def->eraseFromParent(); + ++NumCopiesDeleted; + } } } // If we weren't able to reference the original source directly, create a @@ -337,12 +349,14 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) { if (!Src0) { SubReg0 = 0; Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass); - insertCopy(TII, MI, Src0, OrigSrc0, true); + insertCopy(TII, MI, Src0, OrigSrc0, KillSrc0); + KillSrc0 = true; } if (!Src1) { SubReg1 = 0; Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass); - insertCopy(TII, MI, Src1, OrigSrc1, true); + insertCopy(TII, MI, Src1, OrigSrc1, KillSrc1); + KillSrc1 = true; } // Create a vreg for the destination. @@ -353,17 +367,17 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) { // For now, all of the new instructions have the same simple three-register // form, so no need to special case based on what instruction we're // building. - BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(NewOpc), Dst) - .addReg(Src0, getKillRegState(true), SubReg0) - .addReg(Src1, getKillRegState(true), SubReg1); + BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), Dst) + .addReg(Src0, getKillRegState(KillSrc0), SubReg0) + .addReg(Src1, getKillRegState(KillSrc1), SubReg1); // Now copy the result back out to a GPR. // FIXME: Try to avoid this if all uses could actually just use the FPR64 // directly. - insertCopy(TII, MI, MI->getOperand(0).getReg(), Dst, true); + insertCopy(TII, MI, MI.getOperand(0).getReg(), Dst, true); // Erase the old instruction. 
- MI->eraseFromParent(); + MI.eraseFromParent(); ++NumScalarInsnsUsed; } @@ -372,8 +386,7 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) { bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) { bool Changed = false; for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) { - MachineInstr *MI = I; - ++I; + MachineInstr &MI = *I++; if (isProfitableToTransform(MI)) { transformInstruction(MI); Changed = true; @@ -387,6 +400,9 @@ bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) { bool Changed = false; DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n"); + if (skipFunction(*mf.getFunction())) + return false; + MRI = &mf.getRegInfo(); TII = mf.getSubtarget().getInstrInfo(); diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp index ada995bad37e6..22374f754603d 100644 --- a/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -49,6 +49,7 @@ namespace { class AArch64AsmPrinter : public AsmPrinter { AArch64MCInstLower MCInstLowering; StackMaps SM; + const AArch64Subtarget *STI; public: AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) @@ -83,11 +84,11 @@ public: bool runOnMachineFunction(MachineFunction &F) override { AArch64FI = F.getInfo<AArch64FunctionInfo>(); + STI = static_cast<const AArch64Subtarget*>(&F.getSubtarget()); return AsmPrinter::runOnMachineFunction(F); } private: - MachineLocation getDebugValueLocation(const MachineInstr *MI) const; void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O); bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O); bool printAsmRegInClass(const MachineOperand &MO, @@ -112,6 +113,9 @@ private: /// \brief Emit the LOHs contained in AArch64FI. void EmitLOHs(); + /// Emit instruction to set float register to zero. + void EmitFMov0(const MachineInstr &MI); + typedef std::map<const MachineInstr *, MCSymbol *> MInstToMCSymbol; MInstToMCSymbol LOHInstToLabel; }; @@ -133,19 +137,6 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { } } -MachineLocation -AArch64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { - MachineLocation Location; - assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!"); - // Frame address. Currently handles register +- offset only. - if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm()) - Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm()); - else { - DEBUG(dbgs() << "DBG_VALUE instruction ignored! 
" << *MI << "\n"); - } - return Location; -} - void AArch64AsmPrinter::EmitLOHs() { SmallVector<MCSymbol *, 3> MCArgs; @@ -238,8 +229,7 @@ bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO, const TargetRegisterClass *RC, bool isVector, raw_ostream &O) { assert(MO.isReg() && "Should only get here with a register!"); - const AArch64RegisterInfo *RI = - MF->getSubtarget<AArch64Subtarget>().getRegisterInfo(); + const TargetRegisterInfo *RI = STI->getRegisterInfo(); unsigned Reg = MO.getReg(); unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg)); assert(RI->regsOverlap(RegToPrint, Reg)); @@ -404,16 +394,16 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); EncodedBytes = 16; // Materialize the jump address: - EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZWi) + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZXi) .addReg(ScratchReg) .addImm((CallTarget >> 32) & 0xFFFF) .addImm(32)); - EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi) + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKXi) .addReg(ScratchReg) .addReg(ScratchReg) .addImm((CallTarget >> 16) & 0xFFFF) .addImm(16)); - EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi) + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKXi) .addReg(ScratchReg) .addReg(ScratchReg) .addImm(CallTarget & 0xFFFF) @@ -430,6 +420,40 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); } +void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) { + unsigned DestReg = MI.getOperand(0).getReg(); + if (STI->hasZeroCycleZeroing()) { + // Convert S/D register to corresponding Q register + if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31) { + DestReg = AArch64::Q0 + (DestReg - AArch64::S0); + } else { + assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31); + DestReg = AArch64::Q0 + (DestReg - AArch64::D0); + } + MCInst MOVI; + MOVI.setOpcode(AArch64::MOVIv2d_ns); + MOVI.addOperand(MCOperand::createReg(DestReg)); + MOVI.addOperand(MCOperand::createImm(0)); + EmitToStreamer(*OutStreamer, MOVI); + } else { + MCInst FMov; + switch (MI.getOpcode()) { + default: llvm_unreachable("Unexpected opcode"); + case AArch64::FMOVS0: + FMov.setOpcode(AArch64::FMOVWSr); + FMov.addOperand(MCOperand::createReg(DestReg)); + FMov.addOperand(MCOperand::createReg(AArch64::WZR)); + break; + case AArch64::FMOVD0: + FMov.setOpcode(AArch64::FMOVXDr); + FMov.addOperand(MCOperand::createReg(DestReg)); + FMov.addOperand(MCOperand::createReg(AArch64::XZR)); + break; + } + EmitToStreamer(*OutStreamer, FMov); + } +} + // Simple pseudo-instructions have their lowering (with expansion to real // instructions) auto-generated. 
#include "AArch64GenMCPseudoLowering.inc" @@ -535,6 +559,11 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } + case AArch64::FMOVS0: + case AArch64::FMOVD0: + EmitFMov0(*MI); + return; + case TargetOpcode::STACKMAP: return LowerSTACKMAP(*OutStreamer, SM, *MI); diff --git a/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/lib/Target/AArch64/AArch64BranchRelaxation.cpp index a614f555a4e9f..9ec6ae4118a44 100644 --- a/lib/Target/AArch64/AArch64BranchRelaxation.cpp +++ b/lib/Target/AArch64/AArch64BranchRelaxation.cpp @@ -177,7 +177,7 @@ void AArch64BranchRelaxation::scanFunction() { void AArch64BranchRelaxation::computeBlockSize(const MachineBasicBlock &MBB) { unsigned Size = 0; for (const MachineInstr &MI : MBB) - Size += TII->GetInstSizeInBytes(&MI); + Size += TII->GetInstSizeInBytes(MI); BlockInfo[MBB.getNumber()].Size = Size; } @@ -195,7 +195,7 @@ unsigned AArch64BranchRelaxation::getInstrOffset(MachineInstr *MI) const { // Sum instructions before MI in MBB. for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) { assert(I != MBB->end() && "Didn't find MI in its own basic block?"); - Offset += TII->GetInstSizeInBytes(I); + Offset += TII->GetInstSizeInBytes(*I); } return Offset; } @@ -415,12 +415,12 @@ bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) { // Analyze the branch so we know how to update the successor lists. MachineBasicBlock *TBB, *FBB; SmallVector<MachineOperand, 2> Cond; - TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, false); + TII->analyzeBranch(*MBB, TBB, FBB, Cond, false); MachineBasicBlock *NewBB = splitBlockBeforeInstr(MI); // No need for the branch to the next block. We're adding an unconditional // branch to the destination. - int delta = TII->GetInstSizeInBytes(&MBB->back()); + int delta = TII->GetInstSizeInBytes(MBB->back()); BlockInfo[MBB->getNumber()].Size -= delta; MBB->back().eraseFromParent(); // BlockInfo[SplitBB].Offset is wrong temporarily, fixed below @@ -446,12 +446,12 @@ bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) { if (MI->getOpcode() == AArch64::Bcc) invertBccCondition(MIB); MIB.addMBB(NextBB); - BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); + BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(MBB->back()); BuildMI(MBB, DebugLoc(), TII->get(AArch64::B)).addMBB(DestBB); - BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); + BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(MBB->back()); // Remove the old conditional branch. It may or may not still be in MBB. - BlockInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI); + BlockInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(*MI); MI->eraseFromParent(); // Finally, keep the block offsets up to date. @@ -463,12 +463,13 @@ bool AArch64BranchRelaxation::relaxBranchInstructions() { bool Changed = false; // Relaxing branches involves creating new basic blocks, so re-eval // end() for termination. 
- for (auto &MBB : *MF) { - MachineInstr *MI = MBB.getFirstTerminator(); - if (isConditionalBranch(MI->getOpcode()) && - !isBlockInRange(MI, getDestBlock(MI), - getBranchDisplacementBits(MI->getOpcode()))) { - fixupConditionalBranch(MI); + for (MachineFunction::iterator I = MF->begin(); I != MF->end(); ++I) { + MachineBasicBlock &MBB = *I; + MachineInstr &MI = *MBB.getFirstTerminator(); + if (isConditionalBranch(MI.getOpcode()) && + !isBlockInRange(&MI, getDestBlock(&MI), + getBranchDisplacementBits(MI.getOpcode()))) { + fixupConditionalBranch(&MI); ++NumRelaxed; Changed = true; } @@ -513,8 +514,7 @@ bool AArch64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) { return MadeChange; } -/// createAArch64BranchRelaxation - returns an instance of the constpool -/// island pass. +/// Returns an instance of the AArch64 Branch Relaxation pass. FunctionPass *llvm::createAArch64BranchRelaxation() { return new AArch64BranchRelaxation(); } diff --git a/lib/Target/AArch64/AArch64CallLowering.cpp b/lib/Target/AArch64/AArch64CallLowering.cpp new file mode 100644 index 0000000000000..e3522e63c21c0 --- /dev/null +++ b/lib/Target/AArch64/AArch64CallLowering.cpp @@ -0,0 +1,104 @@ +//===-- llvm/lib/Target/AArch64/AArch64CallLowering.cpp - Call lowering ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the lowering of LLVM calls to machine code calls for +/// GlobalISel. +/// +//===----------------------------------------------------------------------===// + +#include "AArch64CallLowering.h" +#include "AArch64ISelLowering.h" + +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" + +using namespace llvm; + +#ifndef LLVM_BUILD_GLOBAL_ISEL +#error "This shouldn't be built without GISel" +#endif + +AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI) + : CallLowering(&TLI) { +} + +bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, + const Value *Val, unsigned VReg) const { + MachineInstr *Return = MIRBuilder.buildInstr(AArch64::RET_ReallyLR); + assert(Return && "Unable to build a return instruction?!"); + + assert(((Val && VReg) || (!Val && !VReg)) && "Return value without a vreg"); + if (VReg) { + assert(Val->getType()->isIntegerTy() && "Type not supported yet"); + unsigned Size = Val->getType()->getPrimitiveSizeInBits(); + assert((Size == 64 || Size == 32) && "Size not supported yet"); + unsigned ResReg = (Size == 32) ? AArch64::W0 : AArch64::X0; + // Set the insertion point to be right before Return. 
+ MIRBuilder.setInstr(*Return, /* Before */ true); + MachineInstr *Copy = + MIRBuilder.buildInstr(TargetOpcode::COPY, ResReg, VReg); + (void)Copy; + assert(Copy->getNextNode() == Return && + "The insertion did not happen where we expected"); + MachineInstrBuilder(MIRBuilder.getMF(), Return) + .addReg(ResReg, RegState::Implicit); + } + return true; +} + +bool AArch64CallLowering::lowerFormalArguments( + MachineIRBuilder &MIRBuilder, const Function::ArgumentListType &Args, + const SmallVectorImpl<unsigned> &VRegs) const { + MachineFunction &MF = MIRBuilder.getMF(); + const Function &F = *MF.getFunction(); + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); + + unsigned NumArgs = Args.size(); + Function::const_arg_iterator CurOrigArg = Args.begin(); + const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>(); + for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) { + MVT ValVT = MVT::getVT(CurOrigArg->getType()); + CCAssignFn *AssignFn = + TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false); + bool Res = + AssignFn(i, ValVT, ValVT, CCValAssign::Full, ISD::ArgFlagsTy(), CCInfo); + assert(!Res && "Call operand has unhandled type"); + (void)Res; + } + assert(ArgLocs.size() == Args.size() && + "We have a different number of location and args?!"); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + + assert(VA.isRegLoc() && "Not yet implemented"); + // Transform the arguments in physical registers into virtual ones. + MIRBuilder.getMBB().addLiveIn(VA.getLocReg()); + MIRBuilder.buildInstr(TargetOpcode::COPY, VRegs[i], VA.getLocReg()); + + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + // We don't care about bitcast. + break; + case CCValAssign::AExt: + case CCValAssign::SExt: + case CCValAssign::ZExt: + // Zero/Sign extend the register. + assert(0 && "Not yet implemented"); + break; + } + } + return true; +} diff --git a/lib/Target/AArch64/AArch64CallLowering.h b/lib/Target/AArch64/AArch64CallLowering.h new file mode 100644 index 0000000000000..411622803461f --- /dev/null +++ b/lib/Target/AArch64/AArch64CallLowering.h @@ -0,0 +1,36 @@ +//===-- llvm/lib/Target/AArch64/AArch64CallLowering.h - Call lowering -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file describes how to lower LLVM calls to machine code calls. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING +#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING + +#include "llvm/CodeGen/GlobalISel/CallLowering.h" + +namespace llvm { + +class AArch64TargetLowering; + +class AArch64CallLowering: public CallLowering { + public: + AArch64CallLowering(const AArch64TargetLowering &TLI); + + bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val, + unsigned VReg) const override; + bool + lowerFormalArguments(MachineIRBuilder &MIRBuilder, + const Function::ArgumentListType &Args, + const SmallVectorImpl<unsigned> &VRegs) const override; +}; +} // End of namespace llvm; +#endif diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td index 388d64ec4e99d..178e3971640ed 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -45,6 +45,9 @@ def CC_AArch64_AAPCS : CallingConv<[ // supported there. CCIfNest<CCAssignToReg<[X18]>>, + // Pass SwiftSelf in a callee saved register. + CCIfSwiftSelf<CCIfType<[i64], CCAssignToRegWithShadow<[X20], [W20]>>>, + CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>, // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, @@ -86,6 +89,8 @@ def RetCC_AArch64_AAPCS : CallingConv<[ CCIfType<[v2f32], CCBitConvertToType<v2i32>>, CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>, + CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X19], [W19]>>>, + // Big endian vectors must be passed as if they were 1-element vectors so that // their lanes are in a consistent order. CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8], @@ -126,6 +131,12 @@ def CC_AArch64_DarwinPCS : CallingConv<[ // slot is 64-bit. CCIfByVal<CCPassByVal<8, 8>>, + // Pass SwiftSelf in a callee saved register. + CCIfSwiftSelf<CCIfType<[i64], CCAssignToRegWithShadow<[X20], [W20]>>>, + + // A SwiftError is passed in X19. + CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X19], [W19]>>>, + CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>, // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, @@ -270,6 +281,9 @@ def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, // case) def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>; +def CSR_AArch64_AAPCS_SwiftError + : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X19)>; + // The function used by Darwin to obtain the address of a thread-local variable // guarantees more than a normal AAPCS function. 
x16 and x17 are used on the // fast path for calculation, but other registers except X0 (argument/return) @@ -310,3 +324,7 @@ def CSR_AArch64_AllRegs (sequence "Q%u", 0, 31))>; def CSR_AArch64_NoRegs : CalleeSavedRegs<(add)>; + +def CSR_AArch64_RT_MostRegs : CalleeSavedRegs<(add CSR_AArch64_AAPCS, + (sequence "X%u", 9, 15))>; + diff --git a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp index 9310ac4a44a2d..011a03622ba51 100644 --- a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp +++ b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -39,6 +39,9 @@ struct LDTLSCleanup : public MachineFunctionPass { LDTLSCleanup() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override { + if (skipFunction(*MF.getFunction())) + return false; + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); if (AFI->getNumLocalDynamicTLSAccesses() < 2) { // No point folding accesses if there isn't at least two. @@ -69,9 +72,9 @@ struct LDTLSCleanup : public MachineFunctionPass { break; if (TLSBaseAddrReg) - I = replaceTLSBaseAddrCall(I, TLSBaseAddrReg); + I = replaceTLSBaseAddrCall(*I, TLSBaseAddrReg); else - I = setRegister(I, &TLSBaseAddrReg); + I = setRegister(*I, &TLSBaseAddrReg); Changed = true; break; default: @@ -89,27 +92,27 @@ struct LDTLSCleanup : public MachineFunctionPass { // Replace the TLS_base_addr instruction I with a copy from // TLSBaseAddrReg, returning the new instruction. - MachineInstr *replaceTLSBaseAddrCall(MachineInstr *I, + MachineInstr *replaceTLSBaseAddrCall(MachineInstr &I, unsigned TLSBaseAddrReg) { - MachineFunction *MF = I->getParent()->getParent(); + MachineFunction *MF = I.getParent()->getParent(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the // code sequence assumes the address will be. - MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(), - TII->get(TargetOpcode::COPY), - AArch64::X0).addReg(TLSBaseAddrReg); + MachineInstr *Copy = BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII->get(TargetOpcode::COPY), AArch64::X0) + .addReg(TLSBaseAddrReg); // Erase the TLS_base_addr instruction. - I->eraseFromParent(); + I.eraseFromParent(); return Copy; } // Create a virtal register in *TLSBaseAddrReg, and populate it by // inserting a copy instruction after I. Returns the new instruction. - MachineInstr *setRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) { - MachineFunction *MF = I->getParent()->getParent(); + MachineInstr *setRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) { + MachineFunction *MF = I.getParent()->getParent(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); // Create a virtual register for the TLS base address. @@ -118,7 +121,7 @@ struct LDTLSCleanup : public MachineFunctionPass { // Insert a copy from X0 to TLSBaseAddrReg for later. 
MachineInstr *Copy = - BuildMI(*I->getParent(), ++I->getIterator(), I->getDebugLoc(), + BuildMI(*I.getParent(), ++I.getIterator(), I.getDebugLoc(), TII->get(TargetOpcode::COPY), *TLSBaseAddrReg) .addReg(AArch64::X0); diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp index 78c239b11ef31..5eecb3a868566 100644 --- a/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -179,6 +179,11 @@ struct AArch64CollectLOH : public MachineFunctionPass { bool runOnMachineFunction(MachineFunction &MF) override; + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::AllVRegsAllocated); + } + const char *getPassName() const override { return AARCH64_COLLECT_LOH_NAME; } @@ -623,10 +628,7 @@ static void computeADRP(const InstrToInstrs &UseToDefs, continue; } DEBUG(dbgs() << "Record AdrpAdrp:\n" << *L2 << '\n' << *L1 << '\n'); - SmallVector<const MachineInstr *, 2> Args; - Args.push_back(L2); - Args.push_back(L1); - AArch64FI.addLOHDirective(MCLOH_AdrpAdrp, Args); + AArch64FI.addLOHDirective(MCLOH_AdrpAdrp, {L2, L1}); ++NumADRPSimpleCandidate; } #ifdef DEBUG @@ -760,13 +762,9 @@ static bool registerADRCandidate(const MachineInstr &Use, "ADD already involved in LOH."); DEBUG(dbgs() << "Record AdrpAdd\n" << Def << '\n' << Use << '\n'); - SmallVector<const MachineInstr *, 2> Args; - Args.push_back(&Def); - Args.push_back(&Use); - - AArch64FI.addLOHDirective(Use.getOpcode() == AArch64::ADDXri ? MCLOH_AdrpAdd - : MCLOH_AdrpLdrGot, - Args); + AArch64FI.addLOHDirective( + Use.getOpcode() == AArch64::ADDXri ? MCLOH_AdrpAdd : MCLOH_AdrpLdrGot, + {&Def, &Use}); return true; } @@ -1036,6 +1034,9 @@ static void collectInvolvedReg(const MachineFunction &MF, MapRegToId &RegToId, } bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) + return false; + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); const MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>(); diff --git a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp index fc27bfee73d13..8fff381d391e9 100644 --- a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp +++ b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp @@ -70,7 +70,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" @@ -144,10 +143,18 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare( if (I->getOpcode() != AArch64::Bcc) return nullptr; + // Since we may modify cmp of this MBB, make sure NZCV does not live out. + for (auto SuccBB : MBB->successors()) + if (SuccBB->isLiveIn(AArch64::NZCV)) + return nullptr; + // Now find the instruction controlling the terminator. for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) { --I; assert(!I->isTerminator() && "Spurious terminator"); + // Check if there is any use of NZCV between CMP and Bcc. + if (I->readsRegister(AArch64::NZCV)) + return nullptr; switch (I->getOpcode()) { // cmp is an alias for subs with a dead destination register. 
case AArch64::SUBSWri: @@ -166,7 +173,7 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare( DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n'); return nullptr; } - return I; + return &*I; } // Prevent false positive case like: // cmp w19, #0 @@ -268,13 +275,13 @@ void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI, // The fact that this comparison was picked ensures that it's related to the // first terminator instruction. - MachineInstr *BrMI = MBB->getFirstTerminator(); + MachineInstr &BrMI = *MBB->getFirstTerminator(); // Change condition in branch instruction. - BuildMI(*MBB, BrMI, BrMI->getDebugLoc(), TII->get(AArch64::Bcc)) + BuildMI(*MBB, BrMI, BrMI.getDebugLoc(), TII->get(AArch64::Bcc)) .addImm(Cmp) - .addOperand(BrMI->getOperand(1)); - BrMI->eraseFromParent(); + .addOperand(BrMI.getOperand(1)); + BrMI.eraseFromParent(); MBB->updateTerminator(); @@ -311,6 +318,9 @@ bool AArch64ConditionOptimizer::adjustTo(MachineInstr *CmpMI, bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n" << "********** Function: " << MF.getName() << '\n'); + if (skipFunction(*MF.getFunction())) + return false; + TII = MF.getSubtarget().getInstrInfo(); DomTree = &getAnalysis<MachineDominatorTree>(); MRI = &MF.getRegInfo(); @@ -327,7 +337,7 @@ bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) { SmallVector<MachineOperand, 4> HeadCond; MachineBasicBlock *TBB = nullptr, *FBB = nullptr; - if (TII->AnalyzeBranch(*HBB, TBB, FBB, HeadCond)) { + if (TII->analyzeBranch(*HBB, TBB, FBB, HeadCond)) { continue; } @@ -338,7 +348,7 @@ bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) { SmallVector<MachineOperand, 4> TrueCond; MachineBasicBlock *TBB_TBB = nullptr, *TBB_FBB = nullptr; - if (TII->AnalyzeBranch(*TBB, TBB_TBB, TBB_FBB, TrueCond)) { + if (TII->analyzeBranch(*TBB, TBB_TBB, TBB_FBB, TrueCond)) { continue; } diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp index df1320fbd4c95..e1b0dc724b39a 100644 --- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -18,13 +18,10 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" -#include "llvm/ADT/BitVector.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SparseSet.h" #include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -307,7 +304,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { case AArch64::CBNZW: case AArch64::CBNZX: // These can be converted into a ccmp against #0. 
- return I; + return &*I; } ++NumCmpTermRejs; DEBUG(dbgs() << "Flags not used by terminator: " << *I); @@ -338,7 +335,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { case AArch64::ADDSWrr: case AArch64::ADDSXrr: if (isDeadDef(I->getOperand(0).getReg())) - return I; + return &*I; DEBUG(dbgs() << "Can't convert compare with live destination: " << *I); ++NumLiveDstRejs; return nullptr; @@ -346,12 +343,12 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { case AArch64::FCMPDrr: case AArch64::FCMPESrr: case AArch64::FCMPEDrr: - return I; + return &*I; } // Check for flag reads and clobbers. MIOperands::PhysRegInfo PRI = - MIOperands(I).analyzePhysReg(AArch64::NZCV, TRI); + MIOperands(*I).analyzePhysReg(AArch64::NZCV, TRI); if (PRI.Read) { // The ccmp doesn't produce exactly the same flags as the original @@ -496,7 +493,7 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) { // The branch we're looking to eliminate must be analyzable. HeadCond.clear(); MachineBasicBlock *TBB = nullptr, *FBB = nullptr; - if (TII->AnalyzeBranch(*Head, TBB, FBB, HeadCond)) { + if (TII->analyzeBranch(*Head, TBB, FBB, HeadCond)) { DEBUG(dbgs() << "Head branch not analyzable.\n"); ++NumHeadBranchRejs; return false; @@ -524,7 +521,7 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) { CmpBBCond.clear(); TBB = FBB = nullptr; - if (TII->AnalyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) { + if (TII->analyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) { DEBUG(dbgs() << "CmpBB branch not analyzable.\n"); ++NumCmpBranchRejs; return false; @@ -759,7 +756,6 @@ void initializeAArch64ConditionalComparesPass(PassRegistry &); INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp", "AArch64 CCMP Pass", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp", @@ -770,7 +766,6 @@ FunctionPass *llvm::createAArch64ConditionalCompares() { } void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<MachineBranchProbabilityInfo>(); AU.addRequired<MachineDominatorTree>(); AU.addPreserved<MachineDominatorTree>(); AU.addRequired<MachineLoopInfo>(); @@ -849,9 +844,9 @@ bool AArch64ConditionalCompares::shouldConvert() { // Instruction depths can be computed for all trace instructions above CmpBB. 
unsigned HeadDepth = - Trace.getInstrCycles(CmpConv.Head->getFirstTerminator()).Depth; + Trace.getInstrCycles(*CmpConv.Head->getFirstTerminator()).Depth; unsigned CmpBBDepth = - Trace.getInstrCycles(CmpConv.CmpBB->getFirstTerminator()).Depth; + Trace.getInstrCycles(*CmpConv.CmpBB->getFirstTerminator()).Depth; DEBUG(dbgs() << "Head depth: " << HeadDepth << "\nCmpBB depth: " << CmpBBDepth << '\n'); if (CmpBBDepth > HeadDepth + DelayLimit) { @@ -891,6 +886,9 @@ bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) { bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n" << "********** Function: " << MF.getName() << '\n'); + if (skipFunction(*MF.getFunction())) + return false; + TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); SchedModel = MF.getSubtarget().getSchedModel(); diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index 576cf4a741678..7a6f7669db5f3 100644 --- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -48,6 +48,11 @@ public: bool runOnMachineFunction(MachineFunction &F) override; + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::AllVRegsAllocated); + } + const char *getPassName() const override { return AARCH64_DEAD_REG_DEF_NAME; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -88,6 +93,12 @@ bool AArch64DeadRegisterDefinitions::processMachineBasicBlock( DEBUG(dbgs() << " Ignoring, operand is frame index\n"); continue; } + if (MI.definesRegister(AArch64::XZR) || MI.definesRegister(AArch64::WZR)) { + // It is not allowed to write to the same register (not even the zero + // register) twice in a single instruction. + DEBUG(dbgs() << " Ignoring, XZR or WZR already used by the instruction\n"); + continue; + } for (int i = 0, e = MI.getDesc().getNumDefs(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); if (MO.isReg() && MO.isDead() && MO.isDef()) { @@ -100,7 +111,7 @@ bool AArch64DeadRegisterDefinitions::processMachineBasicBlock( continue; } // Don't change the register if there's an implicit def of a subreg or - // supperreg. + // superreg. if (implicitlyDefinesOverlappingReg(MO.getReg(), MI)) { DEBUG(dbgs() << " Ignoring, implicitly defines overlap reg.\n"); continue; @@ -123,6 +134,8 @@ bool AArch64DeadRegisterDefinitions::processMachineBasicBlock( MO.setReg(NewReg); DEBUG(MI.print(dbgs())); ++NumDeadDefsReplaced; + // Only replace one dead register, see check for zero register above. 
+ break; } } } @@ -136,6 +149,9 @@ bool AArch64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n"); + if (skipFunction(*MF.getFunction())) + return false; + for (auto &MBB : MF) if (processMachineBasicBlock(MBB)) Changed = true; diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index d24e42a937634..5e477d39e074a 100644 --- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -17,6 +17,7 @@ #include "MCTargetDesc/AArch64AddressingModes.h" #include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/Support/MathExtras.h" @@ -46,9 +47,18 @@ public: private: bool expandMBB(MachineBasicBlock &MBB); - bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); + bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned BitSize); + + bool expandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + unsigned LdarOp, unsigned StlrOp, unsigned CmpOp, + unsigned ExtendImm, unsigned ZeroReg, + MachineBasicBlock::iterator &NextMBBI); + bool expandCMP_SWAP_128(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); }; char AArch64ExpandPseudo::ID = 0; } @@ -403,9 +413,17 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned BitSize) { MachineInstr &MI = *MBBI; + unsigned DstReg = MI.getOperand(0).getReg(); uint64_t Imm = MI.getOperand(1).getImm(); const unsigned Mask = 0xFFFF; + if (DstReg == AArch64::XZR || DstReg == AArch64::WZR) { + // Useless def, and we don't want to risk creating an invalid ORR (which + // would really write to sp). + MI.eraseFromParent(); + return true; + } + // Try a MOVI instruction (aka ORR-immediate with the zero register). 
uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); uint64_t Encoding; @@ -531,7 +549,6 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, LastShift = (TZ / 16) * 16; } unsigned Imm16 = (Imm >> Shift) & Mask; - unsigned DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc)) @@ -572,10 +589,178 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, return true; } +static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) { + for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I) + MBB->addLiveIn(*I); +} + +bool AArch64ExpandPseudo::expandCMP_SWAP( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdarOp, + unsigned StlrOp, unsigned CmpOp, unsigned ExtendImm, unsigned ZeroReg, + MachineBasicBlock::iterator &NextMBBI) { + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + MachineOperand &Dest = MI.getOperand(0); + unsigned StatusReg = MI.getOperand(1).getReg(); + MachineOperand &Addr = MI.getOperand(2); + MachineOperand &Desired = MI.getOperand(3); + MachineOperand &New = MI.getOperand(4); + + LivePhysRegs LiveRegs(&TII->getRegisterInfo()); + LiveRegs.addLiveOuts(MBB); + for (auto I = std::prev(MBB.end()); I != MBBI; --I) + LiveRegs.stepBackward(*I); + + MachineFunction *MF = MBB.getParent(); + auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + + MF->insert(++MBB.getIterator(), LoadCmpBB); + MF->insert(++LoadCmpBB->getIterator(), StoreBB); + MF->insert(++StoreBB->getIterator(), DoneBB); + + // .Lloadcmp: + // ldaxr xDest, [xAddr] + // cmp xDest, xDesired + // b.ne .Ldone + LoadCmpBB->addLiveIn(Addr.getReg()); + LoadCmpBB->addLiveIn(Dest.getReg()); + LoadCmpBB->addLiveIn(Desired.getReg()); + addPostLoopLiveIns(LoadCmpBB, LiveRegs); + + BuildMI(LoadCmpBB, DL, TII->get(LdarOp), Dest.getReg()) + .addReg(Addr.getReg()); + BuildMI(LoadCmpBB, DL, TII->get(CmpOp), ZeroReg) + .addReg(Dest.getReg(), getKillRegState(Dest.isDead())) + .addOperand(Desired) + .addImm(ExtendImm); + BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc)) + .addImm(AArch64CC::NE) + .addMBB(DoneBB) + .addReg(AArch64::NZCV, RegState::Implicit | RegState::Kill); + LoadCmpBB->addSuccessor(DoneBB); + LoadCmpBB->addSuccessor(StoreBB); + + // .Lstore: + // stlxr wStatus, xNew, [xAddr] + // cbnz wStatus, .Lloadcmp + StoreBB->addLiveIn(Addr.getReg()); + StoreBB->addLiveIn(New.getReg()); + addPostLoopLiveIns(StoreBB, LiveRegs); + + BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg) + .addOperand(New) + .addOperand(Addr); + BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW)) + .addReg(StatusReg, RegState::Kill) + .addMBB(LoadCmpBB); + StoreBB->addSuccessor(LoadCmpBB); + StoreBB->addSuccessor(DoneBB); + + DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); + DoneBB->transferSuccessors(&MBB); + addPostLoopLiveIns(DoneBB, LiveRegs); + + MBB.addSuccessor(LoadCmpBB); + + NextMBBI = MBB.end(); + MI.eraseFromParent(); + return true; +} + +bool AArch64ExpandPseudo::expandCMP_SWAP_128( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + MachineOperand &DestLo = MI.getOperand(0); + MachineOperand &DestHi = MI.getOperand(1); + unsigned StatusReg = MI.getOperand(2).getReg(); + MachineOperand &Addr = 
MI.getOperand(3); + MachineOperand &DesiredLo = MI.getOperand(4); + MachineOperand &DesiredHi = MI.getOperand(5); + MachineOperand &NewLo = MI.getOperand(6); + MachineOperand &NewHi = MI.getOperand(7); + + LivePhysRegs LiveRegs(&TII->getRegisterInfo()); + LiveRegs.addLiveOuts(MBB); + for (auto I = std::prev(MBB.end()); I != MBBI; --I) + LiveRegs.stepBackward(*I); + + MachineFunction *MF = MBB.getParent(); + auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + + MF->insert(++MBB.getIterator(), LoadCmpBB); + MF->insert(++LoadCmpBB->getIterator(), StoreBB); + MF->insert(++StoreBB->getIterator(), DoneBB); + + // .Lloadcmp: + // ldaxp xDestLo, xDestHi, [xAddr] + // cmp xDestLo, xDesiredLo + // sbcs xDestHi, xDesiredHi + // b.ne .Ldone + LoadCmpBB->addLiveIn(Addr.getReg()); + LoadCmpBB->addLiveIn(DestLo.getReg()); + LoadCmpBB->addLiveIn(DestHi.getReg()); + LoadCmpBB->addLiveIn(DesiredLo.getReg()); + LoadCmpBB->addLiveIn(DesiredHi.getReg()); + addPostLoopLiveIns(LoadCmpBB, LiveRegs); + + BuildMI(LoadCmpBB, DL, TII->get(AArch64::LDAXPX)) + .addReg(DestLo.getReg(), RegState::Define) + .addReg(DestHi.getReg(), RegState::Define) + .addReg(Addr.getReg()); + BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR) + .addReg(DestLo.getReg(), getKillRegState(DestLo.isDead())) + .addOperand(DesiredLo) + .addImm(0); + BuildMI(LoadCmpBB, DL, TII->get(AArch64::SBCSXr), AArch64::XZR) + .addReg(DestHi.getReg(), getKillRegState(DestHi.isDead())) + .addOperand(DesiredHi); + BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc)) + .addImm(AArch64CC::NE) + .addMBB(DoneBB) + .addReg(AArch64::NZCV, RegState::Implicit | RegState::Kill); + LoadCmpBB->addSuccessor(DoneBB); + LoadCmpBB->addSuccessor(StoreBB); + + // .Lstore: + // stlxp wStatus, xNewLo, xNewHi, [xAddr] + // cbnz wStatus, .Lloadcmp + StoreBB->addLiveIn(Addr.getReg()); + StoreBB->addLiveIn(NewLo.getReg()); + StoreBB->addLiveIn(NewHi.getReg()); + addPostLoopLiveIns(StoreBB, LiveRegs); + BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg) + .addOperand(NewLo) + .addOperand(NewHi) + .addOperand(Addr); + BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW)) + .addReg(StatusReg, RegState::Kill) + .addMBB(LoadCmpBB); + StoreBB->addSuccessor(LoadCmpBB); + StoreBB->addSuccessor(DoneBB); + + DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); + DoneBB->transferSuccessors(&MBB); + addPostLoopLiveIns(DoneBB, LiveRegs); + + MBB.addSuccessor(LoadCmpBB); + + NextMBBI = MBB.end(); + MI.eraseFromParent(); + return true; +} + /// \brief If MBBI references a pseudo instruction that should be expanded here, /// do the expansion and return true. Otherwise return false. 
bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) { + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { MachineInstr &MI = *MBBI; unsigned Opcode = MI.getOpcode(); switch (Opcode) { @@ -717,6 +902,28 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.eraseFromParent(); return true; } + case AArch64::CMP_SWAP_8: + return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRB, AArch64::STLXRB, + AArch64::SUBSWrx, + AArch64_AM::getArithExtendImm(AArch64_AM::UXTB, 0), + AArch64::WZR, NextMBBI); + case AArch64::CMP_SWAP_16: + return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRH, AArch64::STLXRH, + AArch64::SUBSWrx, + AArch64_AM::getArithExtendImm(AArch64_AM::UXTH, 0), + AArch64::WZR, NextMBBI); + case AArch64::CMP_SWAP_32: + return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRW, AArch64::STLXRW, + AArch64::SUBSWrs, + AArch64_AM::getShifterImm(AArch64_AM::LSL, 0), + AArch64::WZR, NextMBBI); + case AArch64::CMP_SWAP_64: + return expandCMP_SWAP(MBB, MBBI, + AArch64::LDAXRX, AArch64::STLXRX, AArch64::SUBSXrs, + AArch64_AM::getShifterImm(AArch64_AM::LSL, 0), + AArch64::XZR, NextMBBI); + case AArch64::CMP_SWAP_128: + return expandCMP_SWAP_128(MBB, MBBI, NextMBBI); } return false; } @@ -729,7 +936,7 @@ bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) { MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); while (MBBI != E) { MachineBasicBlock::iterator NMBBI = std::next(MBBI); - Modified |= expandMI(MBB, MBBI); + Modified |= expandMI(MBB, MBBI, NMBBI); MBBI = NMBBI; } diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index 0ac4b39b03572..e2ab7ab79be19 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -37,7 +37,6 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/CommandLine.h" using namespace llvm; namespace { @@ -144,8 +143,8 @@ private: bool computeCallAddress(const Value *V, Address &Addr); bool simplifyAddress(Address &Addr, MVT VT); void addLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB, - unsigned Flags, unsigned ScaleFactor, - MachineMemOperand *MMO); + MachineMemOperand::Flags Flags, + unsigned ScaleFactor, MachineMemOperand *MMO); bool isMemCpySmall(uint64_t Len, unsigned Alignment); bool tryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len, unsigned Alignment); @@ -439,9 +438,6 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) { .addReg(ADRPReg) .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); - } else if (OpFlags & AArch64II::MO_CONSTPOOL) { - // We can't handle addresses loaded from a constant pool quickly yet. - return 0; } else { // ADRP + ADDX BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), @@ -555,10 +551,9 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) // Iterate through the GEP folding the constants into offsets where // we can. 
- gep_type_iterator GTI = gep_type_begin(U); - for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e; - ++i, ++GTI) { - const Value *Op = *i; + for (gep_type_iterator GTI = gep_type_begin(U), E = gep_type_end(U); + GTI != E; ++GTI) { + const Value *Op = GTI.getOperand(); if (StructType *STy = dyn_cast<StructType>(*GTI)) { const StructLayout *SL = DL.getStructLayout(STy); unsigned Idx = cast<ConstantInt>(Op)->getZExtValue(); @@ -947,10 +942,7 @@ bool AArch64FastISel::isValueAvailable(const Value *V) const { return true; const auto *I = cast<Instruction>(V); - if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) - return true; - - return false; + return FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB; } bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { @@ -1048,7 +1040,7 @@ bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { void AArch64FastISel::addLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB, - unsigned Flags, + MachineMemOperand::Flags Flags, unsigned ScaleFactor, MachineMemOperand *MMO) { int64_t Offset = Addr.getOffset() / ScaleFactor; @@ -1612,8 +1604,8 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, unsigned AArch64FastISel::emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm) { - assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR) && - "ISD nodes are not consecutive!"); + static_assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR), + "ISD nodes are not consecutive!"); static const unsigned OpcTable[3][2] = { { AArch64::ANDWri, AArch64::ANDXri }, { AArch64::ORRWri, AArch64::ORRXri }, @@ -1659,8 +1651,8 @@ unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT, unsigned LHSReg, bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, uint64_t ShiftImm) { - assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR) && - "ISD nodes are not consecutive!"); + static_assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR), + "ISD nodes are not consecutive!"); static const unsigned OpcTable[3][2] = { { AArch64::ANDWrs, AArch64::ANDXrs }, { AArch64::ORRWrs, AArch64::ORRXrs }, @@ -1904,6 +1896,21 @@ bool AArch64FastISel::selectLoad(const Instruction *I) { cast<LoadInst>(I)->isAtomic()) return false; + const Value *SV = I->getOperand(0); + if (TLI.supportSwiftError()) { + // Swifterror values can come from either a function parameter with + // swifterror attribute or an alloca with swifterror attribute. + if (const Argument *Arg = dyn_cast<Argument>(SV)) { + if (Arg->hasSwiftErrorAttr()) + return false; + } + + if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) { + if (Alloca->isSwiftError()) + return false; + } + } + // See if we can handle this address. Address Addr; if (!computeAddress(I->getOperand(0), Addr, I->getType())) @@ -2068,6 +2075,21 @@ bool AArch64FastISel::selectStore(const Instruction *I) { cast<StoreInst>(I)->isAtomic()) return false; + const Value *PtrV = I->getOperand(1); + if (TLI.supportSwiftError()) { + // Swifterror values can come from either a function parameter with + // swifterror attribute or an alloca with swifterror attribute. + if (const Argument *Arg = dyn_cast<Argument>(PtrV)) { + if (Arg->hasSwiftErrorAttr()) + return false; + } + + if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) { + if (Alloca->isSwiftError()) + return false; + } + } + // Get the value to be stored into a register. Use the zero register directly // when possible to avoid an unnecessary copy and a wasted register. 
unsigned SrcReg = 0; @@ -2813,6 +2835,8 @@ bool AArch64FastISel::fastLowerArguments() { if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) || F->getAttributes().hasAttribute(Idx, Attribute::InReg) || F->getAttributes().hasAttribute(Idx, Attribute::StructRet) || + F->getAttributes().hasAttribute(Idx, Attribute::SwiftSelf) || + F->getAttributes().hasAttribute(Idx, Attribute::SwiftError) || F->getAttributes().hasAttribute(Idx, Attribute::Nest)) return false; @@ -3064,7 +3088,8 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { return false; for (auto Flag : CLI.OutFlags) - if (Flag.isInReg() || Flag.isSRet() || Flag.isNest() || Flag.isByVal()) + if (Flag.isInReg() || Flag.isSRet() || Flag.isNest() || Flag.isByVal() || + Flag.isSwiftSelf() || Flag.isSwiftError()) return false; // Set up the argument vectors. @@ -3646,6 +3671,10 @@ bool AArch64FastISel::selectRet(const Instruction *I) { if (F.isVarArg()) return false; + if (TLI.supportSwiftError() && + F.getAttributes().hasAttrSomewhere(Attribute::SwiftError)) + return false; + if (TLI.supportSplitCSR(FuncInfo.MF)) return false; @@ -4814,18 +4843,18 @@ bool AArch64FastISel::selectGetElementPtr(const Instruction *I) { // Keep a running tab of the total offset to coalesce multiple N = N + Offset // into a single N = N + TotalOffset. uint64_t TotalOffs = 0; - Type *Ty = I->getOperand(0)->getType(); MVT VT = TLI.getPointerTy(DL); - for (auto OI = std::next(I->op_begin()), E = I->op_end(); OI != E; ++OI) { - const Value *Idx = *OI; - if (auto *StTy = dyn_cast<StructType>(Ty)) { + for (gep_type_iterator GTI = gep_type_begin(I), E = gep_type_end(I); + GTI != E; ++GTI) { + const Value *Idx = GTI.getOperand(); + if (auto *StTy = dyn_cast<StructType>(*GTI)) { unsigned Field = cast<ConstantInt>(Idx)->getZExtValue(); // N = N + Offset if (Field) TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field); - Ty = StTy->getElementType(Field); } else { - Ty = cast<SequentialType>(Ty)->getElementType(); + Type *Ty = GTI.getIndexedType(); + // If this is a constant subscript, handle it quickly. if (const auto *CI = dyn_cast<ConstantInt>(Idx)) { if (CI->isZero()) diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 3f63d049c34ed..82111e5c72593 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -93,6 +93,7 @@ #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -127,12 +128,7 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); unsigned NumBytes = AFI->getLocalStackSize(); - // Note: currently hasFP() is always true for hasCalls(), but that's an - // implementation detail of the current code, not a strict requirement, - // so stay safe here and check both. 
- if (MFI->hasCalls() || hasFP(MF) || NumBytes > 128) - return false; - return true; + return !(MFI->hasCalls() || hasFP(MF) || NumBytes > 128); } /// hasFP - Return true if the specified function should have a dedicated frame @@ -140,9 +136,12 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); - return (MFI->hasCalls() || MFI->hasVarSizedObjects() || - MFI->isFrameAddressTaken() || MFI->hasStackMap() || - MFI->hasPatchPoint() || RegInfo->needsStackRealignment(MF)); + // Retain behavior of always omitting the FP for leaf functions when possible. + return (MFI->hasCalls() && + MF.getTarget().Options.DisableFramePointerElim(MF)) || + MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() || + MFI->hasStackMap() || MFI->hasPatchPoint() || + RegInfo->needsStackRealignment(MF); } /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is @@ -155,7 +154,7 @@ AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { return !MF.getFrameInfo()->hasVarSizedObjects(); } -void AArch64FrameLowering::eliminateCallFramePseudoInstr( +MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { const AArch64InstrInfo *TII = @@ -170,7 +169,7 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr( unsigned Align = getStackAlignment(); int64_t Amount = I->getOperand(0).getImm(); - Amount = RoundUpToAlignment(Amount, Align); + Amount = alignTo(Amount, Align); if (!IsDestroy) Amount = -Amount; @@ -186,7 +185,7 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr( // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses // LSL #0, and the other uses LSL #12. // - // Mostly call frames will be allocated at the start of a function so + // Most call frames will be allocated at the start of a function so // this is OK, but it is a limitation that needs dealing with. assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII); @@ -198,12 +197,11 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr( emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount, TII); } - MBB.erase(I); + return MBB.erase(I); } void AArch64FrameLowering::emitCalleeSavedFrameMoves( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - unsigned FramePtr) const { + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); @@ -216,75 +214,194 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves( if (CSI.empty()) return; - const DataLayout &TD = MF.getDataLayout(); - bool HasFP = hasFP(MF); - - // Calculate amount of bytes used for return address storing. - int stackGrowth = -TD.getPointerSize(0); - - // Calculate offsets. - int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth; - unsigned TotalSkipped = 0; for (const auto &Info : CSI) { unsigned Reg = Info.getReg(); - int64_t Offset = MFI->getObjectOffset(Info.getFrameIdx()) - - getOffsetOfLocalArea() + saveAreaOffset; - - // Don't output a new CFI directive if we're re-saving the frame pointer or - // link register. 
This happens when the PrologEpilogInserter has inserted an - // extra "STP" of the frame pointer and link register -- the "emitPrologue" - // method automatically generates the directives when frame pointers are - // used. If we generate CFI directives for the extra "STP"s, the linker will - // lose track of the correct values for the frame pointer and link register. - if (HasFP && (FramePtr == Reg || Reg == AArch64::LR)) { - TotalSkipped += stackGrowth; - continue; - } - + int64_t Offset = + MFI->getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea(); unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); - unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( - nullptr, DwarfReg, Offset - TotalSkipped)); + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); } } -/// Get FPOffset by analyzing the first instruction. -static int getFPOffsetInPrologue(MachineInstr *MBBI) { - // First instruction must a) allocate the stack and b) have an immediate - // that is a multiple of -2. - assert(((MBBI->getOpcode() == AArch64::STPXpre || - MBBI->getOpcode() == AArch64::STPDpre) && - MBBI->getOperand(3).getReg() == AArch64::SP && - MBBI->getOperand(4).getImm() < 0 && - (MBBI->getOperand(4).getImm() & 1) == 0)); - - // Frame pointer is fp = sp - 16. Since the STPXpre subtracts the space - // required for the callee saved register area we get the frame pointer - // by addding that offset - 16 = -getImm()*8 - 2*8 = -(getImm() + 2) * 8. - int FPOffset = -(MBBI->getOperand(4).getImm() + 2) * 8; - assert(FPOffset >= 0 && "Bad Framepointer Offset"); - return FPOffset; -} +// Find a scratch register that we can use at the start of the prologue to +// re-align the stack pointer. We avoid using callee-save registers since they +// may appear to be free when this is called from canUseAsPrologue (during +// shrink wrapping), but then no longer be free when this is called from +// emitPrologue. +// +// FIXME: This is a bit conservative, since in the above case we could use one +// of the callee-save registers as a scratch temp to re-align the stack pointer, +// but we would then have to make sure that we were in fact saving at least one +// callee-save register in the prologue, which is additional complexity that +// doesn't seem worth the benefit. +static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) { + MachineFunction *MF = MBB->getParent(); + + // If MBB is an entry block, use X9 as the scratch register + if (&MF->front() == MBB) + return AArch64::X9; + + const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo(); + LivePhysRegs LiveRegs(&TRI); + LiveRegs.addLiveIns(*MBB); + + // Mark callee saved registers as used so we will not choose them. + const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(MF); + for (unsigned i = 0; CSRegs[i]; ++i) + LiveRegs.addReg(CSRegs[i]); + + // Prefer X9 since it was historically used for the prologue scratch reg. 
+ const MachineRegisterInfo &MRI = MF->getRegInfo(); + if (LiveRegs.available(MRI, AArch64::X9)) + return AArch64::X9; -static bool isCSSave(MachineInstr *MBBI) { - return MBBI->getOpcode() == AArch64::STPXi || - MBBI->getOpcode() == AArch64::STPDi || - MBBI->getOpcode() == AArch64::STPXpre || - MBBI->getOpcode() == AArch64::STPDpre; + for (unsigned Reg : AArch64::GPR64RegClass) { + if (LiveRegs.available(MRI, Reg)) + return Reg; + } + return AArch64::NoRegister; } bool AArch64FrameLowering::canUseAsPrologue( const MachineBasicBlock &MBB) const { const MachineFunction *MF = MBB.getParent(); + MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB); const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); // Don't need a scratch register if we're not going to re-align the stack. - // Otherwise, we may need a scratch register to be available and we do not - // support that for now. - return !RegInfo->needsStackRealignment(*MF); + if (!RegInfo->needsStackRealignment(*MF)) + return true; + // Otherwise, we can use any block as long as it has a scratch register + // available. + return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister; +} + +bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( + MachineFunction &MF, unsigned StackBumpBytes) const { + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + + if (AFI->getLocalStackSize() == 0) + return false; + + // 512 is the maximum immediate for stp/ldp that will be used for + // callee-save save/restores + if (StackBumpBytes >= 512) + return false; + + if (MFI->hasVarSizedObjects()) + return false; + + if (RegInfo->needsStackRealignment(MF)) + return false; + + // This isn't strictly necessary, but it simplifies things a bit since the + // current RedZone handling code assumes the SP is adjusted by the + // callee-save save/restore code. + if (canUseRedZone(MF)) + return false; + + return true; +} + +// Convert callee-save register save/restore instruction to do stack pointer +// decrement/increment to allocate/deallocate the callee-save stack area by +// converting store/load to use pre/post increment version. +static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc) { + + unsigned NewOpc; + bool NewIsUnscaled = false; + switch (MBBI->getOpcode()) { + default: + llvm_unreachable("Unexpected callee-save save/restore opcode!"); + case AArch64::STPXi: + NewOpc = AArch64::STPXpre; + break; + case AArch64::STPDi: + NewOpc = AArch64::STPDpre; + break; + case AArch64::STRXui: + NewOpc = AArch64::STRXpre; + NewIsUnscaled = true; + break; + case AArch64::STRDui: + NewOpc = AArch64::STRDpre; + NewIsUnscaled = true; + break; + case AArch64::LDPXi: + NewOpc = AArch64::LDPXpost; + break; + case AArch64::LDPDi: + NewOpc = AArch64::LDPDpost; + break; + case AArch64::LDRXui: + NewOpc = AArch64::LDRXpost; + NewIsUnscaled = true; + break; + case AArch64::LDRDui: + NewOpc = AArch64::LDRDpost; + NewIsUnscaled = true; + break; + } + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)); + MIB.addReg(AArch64::SP, RegState::Define); + + // Copy all operands other than the immediate offset. 
+ unsigned OpndIdx = 0; + for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd; + ++OpndIdx) + MIB.addOperand(MBBI->getOperand(OpndIdx)); + + assert(MBBI->getOperand(OpndIdx).getImm() == 0 && + "Unexpected immediate offset in first/last callee-save save/restore " + "instruction!"); + assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP && + "Unexpected base register in callee-save save/restore instruction!"); + // Last operand is immediate offset that needs fixing. + assert(CSStackSizeInc % 8 == 0); + int64_t CSStackSizeIncImm = CSStackSizeInc; + if (!NewIsUnscaled) + CSStackSizeIncImm /= 8; + MIB.addImm(CSStackSizeIncImm); + + MIB.setMIFlags(MBBI->getFlags()); + MIB.setMemRefs(MBBI->memoperands_begin(), MBBI->memoperands_end()); + + return std::prev(MBB.erase(MBBI)); +} + +// Fixup callee-save register save/restore instructions to take into account +// combined SP bump by adding the local stack size to the stack offsets. +static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI, + unsigned LocalStackSize) { + unsigned Opc = MI.getOpcode(); + (void)Opc; + assert((Opc == AArch64::STPXi || Opc == AArch64::STPDi || + Opc == AArch64::STRXui || Opc == AArch64::STRDui || + Opc == AArch64::LDPXi || Opc == AArch64::LDPDi || + Opc == AArch64::LDRXui || Opc == AArch64::LDRDui) && + "Unexpected callee-save save/restore opcode!"); + + unsigned OffsetIdx = MI.getNumExplicitOperands() - 1; + assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP && + "Unexpected base register in callee-save save/restore instruction!"); + // Last operand is immediate offset that needs fixing. + MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx); + // All generated opcodes have scaled offsets. + assert(LocalStackSize % 8 == 0); + OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / 8); } void AArch64FrameLowering::emitPrologue(MachineFunction &MF, @@ -316,40 +433,59 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // All of the stack allocation is for locals. AFI->setLocalStackSize(NumBytes); - // Label used to tie together the PROLOG_LABEL and the MachineMoves. - MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); - + if (!NumBytes) + return; // REDZONE: If the stack size is less than 128 bytes, we don't need // to actually allocate. - if (NumBytes && !canUseRedZone(MF)) { + if (canUseRedZone(MF)) + ++NumRedZoneFunctions; + else { emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, MachineInstr::FrameSetup); + // Label used to tie together the PROLOG_LABEL and the MachineMoves. + MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); // Encode the stack size of the leaf function. unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); - } else if (NumBytes) { - ++NumRedZoneFunctions; } - return; } - // Only set up FP if we actually need to. - int FPOffset = 0; - if (HasFP) - FPOffset = getFPOffsetInPrologue(MBBI); + auto CSStackSize = AFI->getCalleeSavedStackSize(); + // All of the remaining stack allocations are for locals. + AFI->setLocalStackSize(NumBytes - CSStackSize); - // Move past the saves of the callee-saved registers. 
- while (isCSSave(MBBI)) { - ++MBBI; - NumBytes -= 16; + bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); + if (CombineSPBump) { + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, + MachineInstr::FrameSetup); + NumBytes = 0; + } else if (CSStackSize != 0) { + MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(MBB, MBBI, DL, TII, + -CSStackSize); + NumBytes -= CSStackSize; } assert(NumBytes >= 0 && "Negative stack allocation size!?"); + + // Move past the saves of the callee-saved registers, fixing up the offsets + // and pre-inc if we decided to combine the callee-save and local stack + // pointer bump above. + MachineBasicBlock::iterator End = MBB.end(); + while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup)) { + if (CombineSPBump) + fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize()); + ++MBBI; + } if (HasFP) { + // Only set up FP if we actually need to. Frame pointer is fp = sp - 16. + int FPOffset = CSStackSize - 16; + if (CombineSPBump) + FPOffset += AFI->getLocalStackSize(); + // Issue sub fp, sp, FPOffset or // mov fp,sp when FPOffset is zero. // Note: All stores of callee-saved registers are marked as "FrameSetup". @@ -358,47 +494,46 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineInstr::FrameSetup); } - // All of the remaining stack allocations are for locals. - AFI->setLocalStackSize(NumBytes); - // Allocate space for the rest of the frame. + if (NumBytes) { + const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); + unsigned scratchSPReg = AArch64::SP; - const unsigned Alignment = MFI->getMaxAlignment(); - const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); - unsigned scratchSPReg = AArch64::SP; - if (NumBytes && NeedsRealignment) { - // Use the first callee-saved register as a scratch register. - scratchSPReg = AArch64::X9; - } + if (NeedsRealignment) { + scratchSPReg = findScratchNonCalleeSaveRegister(&MBB); + assert(scratchSPReg != AArch64::NoRegister); + } - // If we're a leaf function, try using the red zone. - if (NumBytes && !canUseRedZone(MF)) - // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have - // the correct value here, as NumBytes also includes padding bytes, - // which shouldn't be counted here. - emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII, - MachineInstr::FrameSetup); + // If we're a leaf function, try using the red zone. + if (!canUseRedZone(MF)) + // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have + // the correct value here, as NumBytes also includes padding bytes, + // which shouldn't be counted here. + emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII, + MachineInstr::FrameSetup); - if (NumBytes && NeedsRealignment) { - const unsigned NrBitsToZero = countTrailingZeros(Alignment); - assert(NrBitsToZero > 1); - assert(scratchSPReg != AArch64::SP); - - // SUB X9, SP, NumBytes - // -- X9 is temporary register, so shouldn't contain any live data here, - // -- free to use. This is already produced by emitFrameOffset above. - // AND SP, X9, 0b11111...0000 - // The logical immediates have a non-trivial encoding. The following - // formula computes the encoded immediate with all ones but - // NrBitsToZero zero bits as least significant bits. 
- uint32_t andMaskEncoded = - (1 <<12) // = N - | ((64-NrBitsToZero) << 6) // immr - | ((64-NrBitsToZero-1) << 0) // imms - ; - BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) - .addReg(scratchSPReg, RegState::Kill) - .addImm(andMaskEncoded); + if (NeedsRealignment) { + const unsigned Alignment = MFI->getMaxAlignment(); + const unsigned NrBitsToZero = countTrailingZeros(Alignment); + assert(NrBitsToZero > 1); + assert(scratchSPReg != AArch64::SP); + + // SUB X9, SP, NumBytes + // -- X9 is temporary register, so shouldn't contain any live data here, + // -- free to use. This is already produced by emitFrameOffset above. + // AND SP, X9, 0b11111...0000 + // The logical immediates have a non-trivial encoding. The following + // formula computes the encoded immediate with all ones but + // NrBitsToZero zero bits as least significant bits. + uint32_t andMaskEncoded = (1 << 12) // = N + | ((64 - NrBitsToZero) << 6) // immr + | ((64 - NrBitsToZero - 1) << 0); // imms + + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) + .addReg(scratchSPReg, RegState::Kill) + .addImm(andMaskEncoded); + AFI->setStackRealigned(true); + } } // If we need a base pointer, set it up here. It's whatever the value of the @@ -491,21 +626,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); - - // Record the location of the stored LR - unsigned LR = RegInfo->getDwarfRegNum(AArch64::LR, true); - CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(nullptr, LR, StackGrowth)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); - - // Record the location of the stored FP - CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); } else { // Encode the stack size of the leaf function. unsigned CFIIndex = MMI.addFrameInst( @@ -515,36 +635,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, .setMIFlags(MachineInstr::FrameSetup); } - // Now emit the moves for whatever callee saved regs we have. - emitCalleeSavedFrameMoves(MBB, MBBI, FramePtr); - } -} - -static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) { - for (unsigned i = 0; CSRegs[i]; ++i) - if (Reg == CSRegs[i]) - return true; - return false; -} - -/// Checks whether the given instruction restores callee save registers -/// and if so returns how many. -static unsigned getNumCSRestores(MachineInstr &MI, const MCPhysReg *CSRegs) { - unsigned RtIdx = 0; - switch (MI.getOpcode()) { - case AArch64::LDPXpost: - case AArch64::LDPDpost: - RtIdx = 1; - // FALLTHROUGH - case AArch64::LDPXi: - case AArch64::LDPDi: - if (!isCalleeSavedRegister(MI.getOperand(RtIdx).getReg(), CSRegs) || - !isCalleeSavedRegister(MI.getOperand(RtIdx + 1).getReg(), CSRegs) || - MI.getOperand(RtIdx + 2).getReg() != AArch64::SP) - return 0; - return 2; + // Now emit the moves for whatever callee saved regs we have (including FP, + // LR if those are saved). 
+ emitCalleeSavedFrameMoves(MBB, MBBI); } - return 0; } void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, @@ -552,7 +646,6 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); MachineFrameInfo *MFI = MF.getFrameInfo(); const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); - const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL; bool IsTailCallReturn = false; @@ -599,7 +692,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // ---------------------| --- | // | | | | // | CalleeSavedReg | | | - // | (NumRestores * 8) | | | + // | (CalleeSavedStackSize)| | | // | | | | // ---------------------| | NumBytes // | | StackSize (StackAdjustUp) @@ -614,41 +707,74 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps // it as the 2nd argument of AArch64ISD::TC_RETURN. - NumBytes += ArgumentPopSize; - unsigned NumRestores = 0; + auto CSStackSize = AFI->getCalleeSavedStackSize(); + bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); + + if (!CombineSPBump && CSStackSize != 0) + convertCalleeSaveRestoreToSPPrePostIncDec( + MBB, std::prev(MBB.getFirstTerminator()), DL, TII, CSStackSize); + // Move past the restores of the callee-saved registers. MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator(); - const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); MachineBasicBlock::iterator Begin = MBB.begin(); while (LastPopI != Begin) { --LastPopI; - unsigned Restores = getNumCSRestores(*LastPopI, CSRegs); - NumRestores += Restores; - if (Restores == 0) { + if (!LastPopI->getFlag(MachineInstr::FrameDestroy)) { ++LastPopI; break; - } + } else if (CombineSPBump) + fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize()); + } + + // If there is a single SP update, insert it before the ret and we're done. + if (CombineSPBump) { + emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, + NumBytes + ArgumentPopSize, TII, + MachineInstr::FrameDestroy); + return; } - NumBytes -= NumRestores * 8; + + NumBytes -= CSStackSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); if (!hasFP(MF)) { + bool RedZone = canUseRedZone(MF); // If this was a redzone leaf function, we don't need to restore the - // stack pointer. - if (!canUseRedZone(MF)) - emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, - TII); - return; + // stack pointer (but we may need to pop stack args for fastcc). + if (RedZone && ArgumentPopSize == 0) + return; + + bool NoCalleeSaveRestore = CSStackSize == 0; + int StackRestoreBytes = RedZone ? 0 : NumBytes; + if (NoCalleeSaveRestore) + StackRestoreBytes += ArgumentPopSize; + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, + StackRestoreBytes, TII, MachineInstr::FrameDestroy); + // If we were able to combine the local stack pop with the argument pop, + // then we're done. + if (NoCalleeSaveRestore || ArgumentPopSize == 0) + return; + NumBytes = 0; } // Restore the original stack pointer. // FIXME: Rather than doing the math here, we should instead just use // non-post-indexed loads for the restores if we aren't actually going to // be able to save any instructions. 
- if (NumBytes || MFI->hasVarSizedObjects()) + if (MFI->hasVarSizedObjects() || AFI->isStackRealigned()) emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP, - -(NumRestores - 2) * 8, TII, MachineInstr::NoFlags); + -CSStackSize + 16, TII, MachineInstr::FrameDestroy); + else if (NumBytes) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII, + MachineInstr::FrameDestroy); + + // This must be placed after the callee-save restore code because that code + // assumes the SP is at the same location as it was after the callee-save save + // code in the prologue. + if (ArgumentPopSize) + emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, + ArgumentPopSize, TII, MachineInstr::FrameDestroy); } /// getFrameIndexReference - Provide a base+offset reference to an FI slot for @@ -726,86 +852,167 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, } static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { - if (Reg != AArch64::LR) - return getKillRegState(true); + // Do not set a kill flag on values that are also marked as live-in. This + // happens with the @llvm-returnaddress intrinsic and with arguments passed in + // callee saved registers. + // Omitting the kill flags is conservatively correct even if the live-in + // is not used after all. + bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg); + return getKillRegState(!IsLiveIn); +} - // LR maybe referred to later by an @llvm.returnaddress intrinsic. - bool LRLiveIn = MF.getRegInfo().isLiveIn(AArch64::LR); - bool LRKill = !(LRLiveIn && MF.getFrameInfo()->isReturnAddressTaken()); - return getKillRegState(LRKill); +static bool produceCompactUnwindFrame(MachineFunction &MF) { + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + AttributeSet Attrs = MF.getFunction()->getAttributes(); + return Subtarget.isTargetMachO() && + !(Subtarget.getTargetLowering()->supportSwiftError() && + Attrs.hasAttrSomewhere(Attribute::SwiftError)); } -bool AArch64FrameLowering::spillCalleeSavedRegisters( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { - MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + +struct RegPairInfo { + RegPairInfo() : Reg1(AArch64::NoRegister), Reg2(AArch64::NoRegister) {} + unsigned Reg1; + unsigned Reg2; + int FrameIdx; + int Offset; + bool IsGPR; + bool isPaired() const { return Reg2 != AArch64::NoRegister; } +}; + +static void computeCalleeSaveRegisterPairs( + MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs) { + + if (CSI.empty()) + return; + + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + CallingConv::ID CC = MF.getFunction()->getCallingConv(); unsigned Count = CSI.size(); - DebugLoc DL; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + (void)CC; + // MachO's compact unwind format relies on all registers being stored in + // pairs. 
+ assert((!produceCompactUnwindFrame(MF) || + CC == CallingConv::PreserveMost || + (Count & 1) == 0) && + "Odd number of callee-saved regs to spill!"); + unsigned Offset = AFI->getCalleeSavedStackSize(); + + for (unsigned i = 0; i < Count; ++i) { + RegPairInfo RPI; + RPI.Reg1 = CSI[i].getReg(); + + assert(AArch64::GPR64RegClass.contains(RPI.Reg1) || + AArch64::FPR64RegClass.contains(RPI.Reg1)); + RPI.IsGPR = AArch64::GPR64RegClass.contains(RPI.Reg1); + + // Add the next reg to the pair if it is in the same register class. + if (i + 1 < Count) { + unsigned NextReg = CSI[i + 1].getReg(); + if ((RPI.IsGPR && AArch64::GPR64RegClass.contains(NextReg)) || + (!RPI.IsGPR && AArch64::FPR64RegClass.contains(NextReg))) + RPI.Reg2 = NextReg; + } - for (unsigned i = 0; i < Count; i += 2) { - unsigned idx = Count - i - 2; - unsigned Reg1 = CSI[idx].getReg(); - unsigned Reg2 = CSI[idx + 1].getReg(); // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI // list to come in sorted by frame index so that we can issue the store // pair instructions directly. Assert if we see anything otherwise. // // The order of the registers in the list is controlled by // getCalleeSavedRegs(), so they will always be in-order, as well. - assert(CSI[idx].getFrameIdx() + 1 == CSI[idx + 1].getFrameIdx() && + assert((!RPI.isPaired() || + (CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx())) && "Out of order callee saved regs!"); + + // MachO's compact unwind format relies on all registers being stored in + // adjacent register pairs. + assert((!produceCompactUnwindFrame(MF) || + CC == CallingConv::PreserveMost || + (RPI.isPaired() && + ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) || + RPI.Reg1 + 1 == RPI.Reg2))) && + "Callee-save registers not saved as adjacent register pair!"); + + RPI.FrameIdx = CSI[i].getFrameIdx(); + + if (Count * 8 != AFI->getCalleeSavedStackSize() && !RPI.isPaired()) { + // Round up size of non-pair to pair size if we need to pad the + // callee-save area to ensure 16-byte alignment. + Offset -= 16; + assert(MFI->getObjectAlignment(RPI.FrameIdx) <= 16); + MFI->setObjectAlignment(RPI.FrameIdx, 16); + AFI->setCalleeSaveStackHasFreeSpace(true); + } else + Offset -= RPI.isPaired() ? 16 : 8; + assert(Offset % 8 == 0); + RPI.Offset = Offset / 8; + assert((RPI.Offset >= -64 && RPI.Offset <= 63) && + "Offset out of bounds for LDP/STP immediate"); + + RegPairs.push_back(RPI); + if (RPI.isPaired()) + ++i; + } +} + +bool AArch64FrameLowering::spillCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + DebugLoc DL; + SmallVector<RegPairInfo, 8> RegPairs; + + computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs); + + for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE; + ++RPII) { + RegPairInfo RPI = *RPII; + unsigned Reg1 = RPI.Reg1; + unsigned Reg2 = RPI.Reg2; unsigned StrOpc; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); - // Issue sequence of non-sp increment and pi sp spills for cs regs. The - // first spill is a pre-increment that allocates the stack. + + // Issue sequence of spills for cs regs. 
The first spill may be converted + // to a pre-decrement store later by emitPrologue if the callee-save stack + // area allocation can't be combined with the local stack area allocation. // For example: - // stp x22, x21, [sp, #-48]! // addImm(-6) + // stp x22, x21, [sp, #0] // addImm(+0) // stp x20, x19, [sp, #16] // addImm(+2) // stp fp, lr, [sp, #32] // addImm(+4) // Rationale: This sequence saves uop updates compared to a sequence of // pre-increment spills like stp xi,xj,[sp,#-16]! - // Note: Similar rational and sequence for restores in epilog. - if (AArch64::GPR64RegClass.contains(Reg1)) { - assert(AArch64::GPR64RegClass.contains(Reg2) && - "Expected GPR64 callee-saved register pair!"); - // For first spill use pre-increment store. - if (i == 0) - StrOpc = AArch64::STPXpre; - else - StrOpc = AArch64::STPXi; - } else if (AArch64::FPR64RegClass.contains(Reg1)) { - assert(AArch64::FPR64RegClass.contains(Reg2) && - "Expected FPR64 callee-saved register pair!"); - // For first spill use pre-increment store. - if (i == 0) - StrOpc = AArch64::STPDpre; - else - StrOpc = AArch64::STPDi; - } else - llvm_unreachable("Unexpected callee saved register!"); - DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1) << ", " - << TRI->getName(Reg2) << ") -> fi#(" << CSI[idx].getFrameIdx() - << ", " << CSI[idx + 1].getFrameIdx() << ")\n"); - // Compute offset: i = 0 => offset = -Count; - // i = 2 => offset = -(Count - 2) + Count = 2 = i; etc. - const int Offset = (i == 0) ? -Count : i; - assert((Offset >= -64 && Offset <= 63) && - "Offset out of bounds for STP immediate"); - MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); - if (StrOpc == AArch64::STPDpre || StrOpc == AArch64::STPXpre) - MIB.addReg(AArch64::SP, RegState::Define); + // Note: Similar rationale and sequence for restores in epilog. + if (RPI.IsGPR) + StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui; + else + StrOpc = RPI.isPaired() ? 
AArch64::STPDi : AArch64::STRDui; + DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1); + if (RPI.isPaired()) + dbgs() << ", " << TRI->getName(Reg2); + dbgs() << ") -> fi#(" << RPI.FrameIdx; + if (RPI.isPaired()) + dbgs() << ", " << RPI.FrameIdx+1; + dbgs() << ")\n"); + MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); MBB.addLiveIn(Reg1); - MBB.addLiveIn(Reg2); - MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)) - .addReg(Reg1, getPrologueDeath(MF, Reg1)) + if (RPI.isPaired()) { + MBB.addLiveIn(Reg2); + MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)); + MIB.addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1), + MachineMemOperand::MOStore, 8, 8)); + } + MIB.addReg(Reg1, getPrologueDeath(MF, Reg1)) .addReg(AArch64::SP) - .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit + .addImm(RPI.Offset) // [sp, #offset*8], where factor*8 is implicit .setMIFlag(MachineInstr::FrameSetup); + MIB.addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx), + MachineMemOperand::MOStore, 8, 8)); } return true; } @@ -816,66 +1023,55 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - unsigned Count = CSI.size(); DebugLoc DL; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + SmallVector<RegPairInfo, 8> RegPairs; if (MI != MBB.end()) DL = MI->getDebugLoc(); - for (unsigned i = 0; i < Count; i += 2) { - unsigned Reg1 = CSI[i].getReg(); - unsigned Reg2 = CSI[i + 1].getReg(); - // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI - // list to come in sorted by frame index so that we can issue the store - // pair instructions directly. Assert if we see anything otherwise. - assert(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx() && - "Out of order callee saved regs!"); - // Issue sequence of non-sp increment and sp-pi restores for cs regs. Only - // the last load is sp-pi post-increment and de-allocates the stack: + computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs); + + for (auto RPII = RegPairs.begin(), RPIE = RegPairs.end(); RPII != RPIE; + ++RPII) { + RegPairInfo RPI = *RPII; + unsigned Reg1 = RPI.Reg1; + unsigned Reg2 = RPI.Reg2; + + // Issue sequence of restores for cs regs. The last restore may be converted + // to a post-increment load later by emitEpilogue if the callee-save stack + // area allocation can't be combined with the local stack area allocation. // For example: // ldp fp, lr, [sp, #32] // addImm(+4) // ldp x20, x19, [sp, #16] // addImm(+2) - // ldp x22, x21, [sp], #48 // addImm(+6) + // ldp x22, x21, [sp, #0] // addImm(+0) // Note: see comment in spillCalleeSavedRegisters() unsigned LdrOpc; + if (RPI.IsGPR) + LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui; + else + LdrOpc = RPI.isPaired() ? 
AArch64::LDPDi : AArch64::LDRDui; + DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1); + if (RPI.isPaired()) + dbgs() << ", " << TRI->getName(Reg2); + dbgs() << ") -> fi#(" << RPI.FrameIdx; + if (RPI.isPaired()) + dbgs() << ", " << RPI.FrameIdx+1; + dbgs() << ")\n"); - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); - if (AArch64::GPR64RegClass.contains(Reg1)) { - assert(AArch64::GPR64RegClass.contains(Reg2) && - "Expected GPR64 callee-saved register pair!"); - if (i == Count - 2) - LdrOpc = AArch64::LDPXpost; - else - LdrOpc = AArch64::LDPXi; - } else if (AArch64::FPR64RegClass.contains(Reg1)) { - assert(AArch64::FPR64RegClass.contains(Reg2) && - "Expected FPR64 callee-saved register pair!"); - if (i == Count - 2) - LdrOpc = AArch64::LDPDpost; - else - LdrOpc = AArch64::LDPDi; - } else - llvm_unreachable("Unexpected callee saved register!"); - DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1) << ", " - << TRI->getName(Reg2) << ") -> fi#(" << CSI[i].getFrameIdx() - << ", " << CSI[i + 1].getFrameIdx() << ")\n"); - - // Compute offset: i = 0 => offset = Count - 2; i = 2 => offset = Count - 4; - // etc. - const int Offset = (i == Count - 2) ? Count : Count - i - 2; - assert((Offset >= -64 && Offset <= 63) && - "Offset out of bounds for LDP immediate"); MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc)); - if (LdrOpc == AArch64::LDPXpost || LdrOpc == AArch64::LDPDpost) - MIB.addReg(AArch64::SP, RegState::Define); - - MIB.addReg(Reg2, getDefRegState(true)) - .addReg(Reg1, getDefRegState(true)) + if (RPI.isPaired()) { + MIB.addReg(Reg2, getDefRegState(true)); + MIB.addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1), + MachineMemOperand::MOLoad, 8, 8)); + } + MIB.addReg(Reg1, getDefRegState(true)) .addReg(AArch64::SP) - .addImm(Offset); // [sp], #offset * 8 or [sp, #offset * 8] - // where the factor * 8 is implicit + .addImm(RPI.Offset) // [sp, #offset*8] where the factor*8 is implicit + .setMIFlag(MachineInstr::FrameDestroy); + MIB.addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx), + MachineMemOperand::MOLoad, 8, 8)); } return true; } @@ -892,8 +1088,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( MF.getSubtarget().getRegisterInfo()); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - SmallVector<unsigned, 4> UnspilledCSGPRs; - SmallVector<unsigned, 4> UnspilledCSFPRs; + unsigned UnspilledCSGPR = AArch64::NoRegister; + unsigned UnspilledCSGPRPaired = AArch64::NoRegister; // The frame record needs to be created by saving the appropriate registers if (hasFP(MF)) { @@ -901,79 +1097,51 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, SavedRegs.set(AArch64::LR); } - // Spill the BasePtr if it's used. Do this first thing so that the - // getCalleeSavedRegs() below will get the right answer. + unsigned BasePointerReg = AArch64::NoRegister; if (RegInfo->hasBasePointer(MF)) - SavedRegs.set(RegInfo->getBaseRegister()); - - if (RegInfo->needsStackRealignment(MF) && !RegInfo->hasBasePointer(MF)) - SavedRegs.set(AArch64::X9); + BasePointerReg = RegInfo->getBaseRegister(); - // If any callee-saved registers are used, the frame cannot be eliminated. 
- unsigned NumGPRSpilled = 0; - unsigned NumFPRSpilled = 0; bool ExtraCSSpill = false; - bool CanEliminateFrame = true; - DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"); const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); + // Figure out which callee-saved registers to save/restore. + for (unsigned i = 0; CSRegs[i]; ++i) { + const unsigned Reg = CSRegs[i]; + + // Add the base pointer register to SavedRegs if it is callee-save. + if (Reg == BasePointerReg) + SavedRegs.set(Reg); - // Check pairs of consecutive callee-saved registers. - for (unsigned i = 0; CSRegs[i]; i += 2) { - assert(CSRegs[i + 1] && "Odd number of callee-saved registers!"); - - const unsigned OddReg = CSRegs[i]; - const unsigned EvenReg = CSRegs[i + 1]; - assert((AArch64::GPR64RegClass.contains(OddReg) && - AArch64::GPR64RegClass.contains(EvenReg)) ^ - (AArch64::FPR64RegClass.contains(OddReg) && - AArch64::FPR64RegClass.contains(EvenReg)) && - "Register class mismatch!"); - - const bool OddRegUsed = SavedRegs.test(OddReg); - const bool EvenRegUsed = SavedRegs.test(EvenReg); - - // Early exit if none of the registers in the register pair is actually - // used. - if (!OddRegUsed && !EvenRegUsed) { - if (AArch64::GPR64RegClass.contains(OddReg)) { - UnspilledCSGPRs.push_back(OddReg); - UnspilledCSGPRs.push_back(EvenReg); - } else { - UnspilledCSFPRs.push_back(OddReg); - UnspilledCSFPRs.push_back(EvenReg); + bool RegUsed = SavedRegs.test(Reg); + unsigned PairedReg = CSRegs[i ^ 1]; + if (!RegUsed) { + if (AArch64::GPR64RegClass.contains(Reg) && + !RegInfo->isReservedReg(MF, Reg)) { + UnspilledCSGPR = Reg; + UnspilledCSGPRPaired = PairedReg; } continue; } - unsigned Reg = AArch64::NoRegister; - // If only one of the registers of the register pair is used, make sure to - // mark the other one as used as well. - if (OddRegUsed ^ EvenRegUsed) { - // Find out which register is the additional spill. - Reg = OddRegUsed ? EvenReg : OddReg; - SavedRegs.set(Reg); + // MachO's compact unwind format relies on all registers being stored in + // pairs. + // FIXME: the usual format is actually better if unwinding isn't needed. + if (produceCompactUnwindFrame(MF) && !SavedRegs.test(PairedReg)) { + SavedRegs.set(PairedReg); + if (AArch64::GPR64RegClass.contains(PairedReg) && + !RegInfo->isReservedReg(MF, PairedReg)) + ExtraCSSpill = true; } + } - DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo)); - DEBUG(dbgs() << ' ' << PrintReg(EvenReg, RegInfo)); - - assert(((OddReg == AArch64::LR && EvenReg == AArch64::FP) || - (RegInfo->getEncodingValue(OddReg) + 1 == - RegInfo->getEncodingValue(EvenReg))) && - "Register pair of non-adjacent registers!"); - if (AArch64::GPR64RegClass.contains(OddReg)) { - NumGPRSpilled += 2; - // If it's not a reserved register, we can use it in lieu of an - // emergency spill slot for the register scavenger. - // FIXME: It would be better to instead keep looking and choose another - // unspilled register that isn't reserved, if there is one. - if (Reg != AArch64::NoRegister && !RegInfo->isReservedReg(MF, Reg)) - ExtraCSSpill = true; - } else - NumFPRSpilled += 2; + DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"; + for (int Reg = SavedRegs.find_first(); Reg != -1; + Reg = SavedRegs.find_next(Reg)) + dbgs() << ' ' << PrintReg(Reg, RegInfo); + dbgs() << "\n";); - CanEliminateFrame = false; - } + // If any callee-saved registers are used, the frame cannot be eliminated. 
+ unsigned NumRegsSpilled = SavedRegs.count(); + bool CanEliminateFrame = NumRegsSpilled == 0; // FIXME: Set BigStack if any stack slot references may be out of range. // For now, just conservatively guestimate based on unscaled indexing @@ -982,8 +1150,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // The CSR spill slots have not been allocated yet, so estimateStackSize // won't include them. MachineFrameInfo *MFI = MF.getFrameInfo(); - unsigned CFSize = - MFI->estimateStackSize(MF) + 8 * (NumGPRSpilled + NumFPRSpilled); + unsigned CFSize = MFI->estimateStackSize(MF) + 8 * NumRegsSpilled; DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n"); bool BigStack = (CFSize >= 256); if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) @@ -996,19 +1163,17 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // above to keep the number of spills even, we don't need to do anything else // here. if (BigStack && !ExtraCSSpill) { - - // If we're adding a register to spill here, we have to add two of them - // to keep the number of regs to spill even. - assert(((UnspilledCSGPRs.size() & 1) == 0) && "Odd number of registers!"); - unsigned Count = 0; - while (!UnspilledCSGPRs.empty() && Count < 2) { - unsigned Reg = UnspilledCSGPRs.back(); - UnspilledCSGPRs.pop_back(); - DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo) - << " to get a scratch register.\n"); - SavedRegs.set(Reg); + if (UnspilledCSGPR != AArch64::NoRegister) { + DEBUG(dbgs() << "Spilling " << PrintReg(UnspilledCSGPR, RegInfo) + << " to get a scratch register.\n"); + SavedRegs.set(UnspilledCSGPR); + // MachO's compact unwind format relies on all registers being stored in + // pairs, so if we need to spill one extra for BigStack, then we need to + // store the pair. + if (produceCompactUnwindFrame(MF)) + SavedRegs.set(UnspilledCSGPRPaired); ExtraCSSpill = true; - ++Count; + NumRegsSpilled = SavedRegs.count(); } // If we didn't find an extra callee-saved register to spill, create @@ -1021,4 +1186,14 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, << " as the emergency spill slot.\n"); } } + + // Round up to register pair alignment to avoid additional SP adjustment + // instructions. + AFI->setCalleeSavedStackSize(alignTo(8 * NumRegsSpilled, 16)); +} + +bool AArch64FrameLowering::enableStackSlotScavenging( + const MachineFunction &MF) const { + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + return AFI->hasCalleeSaveStackFreeSpace(); } diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h index 7d8354c38787c..f254ea9b70aa7 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.h +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -25,12 +25,11 @@ public: true /*StackRealignable*/) {} void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned FramePtr) const; + MachineBasicBlock::iterator MBBI) const; - void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const override; + MachineBasicBlock::iterator + eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const override; /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. 
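The hunk above ends by recording the callee-save area as alignTo(8 * NumRegsSpilled, 16), so an odd spill count leaves an unused 8-byte slot; that is the kind of free space the new enableStackSlotScavenging hook exposes through hasCalleeSaveStackFreeSpace(). A quick check of the arithmetic (editor's sketch, not from the patch):

  #include <cassert>
  #include <cstdint>

  // Editor's sketch of the size computation above: 8 bytes per saved
  // register, rounded up to the 16-byte stack alignment.
  static uint64_t calleeSavedStackSize(unsigned NumRegsSpilled) {
    uint64_t Bytes = 8 * NumRegsSpilled;
    return (Bytes + 15) & ~uint64_t(15); // alignTo(Bytes, 16)
  }

  int main() {
    assert(calleeSavedStackSize(2) == 16); // exact fit, no free slot
    assert(calleeSavedStackSize(3) == 32); // 24 rounded up: 8 bytes left over
    return 0;
  }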
@@ -67,6 +66,12 @@ public: bool enableShrinkWrapping(const MachineFunction &MF) const override { return true; } + + bool enableStackSlotScavenging(const MachineFunction &MF) const override; + +private: + bool shouldCombineCSRLocalStackBump(MachineFunction &MF, + unsigned StackBumpBytes) const; }; } // End llvm namespace diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 6c868880bcac4..8d649250f6569 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -57,7 +57,7 @@ public: return SelectionDAGISel::runOnMachineFunction(MF); } - SDNode *Select(SDNode *Node) override; + void Select(SDNode *Node) override; /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for /// inline asm expressions. @@ -65,8 +65,8 @@ public: unsigned ConstraintID, std::vector<SDValue> &OutOps) override; - SDNode *SelectMLAV64LaneV128(SDNode *N); - SDNode *SelectMULLV64LaneV128(unsigned IntNo, SDNode *N); + bool tryMLAV64LaneV128(SDNode *N); + bool tryMULLV64LaneV128(unsigned IntNo, SDNode *N); bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift); bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift); bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift); @@ -147,28 +147,29 @@ public: SDValue createTuple(ArrayRef<SDValue> Vecs, const unsigned RegClassIDs[], const unsigned SubRegs[]); - SDNode *SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt); + void SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt); - SDNode *SelectIndexedLoad(SDNode *N, bool &Done); + bool tryIndexedLoad(SDNode *N); - SDNode *SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, + void SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, unsigned SubRegIdx); - SDNode *SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc, + void SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc, unsigned SubRegIdx); - SDNode *SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); + void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); + void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); + void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc); + void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc); + void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); + void SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectBitfieldExtractOp(SDNode *N); - SDNode *SelectBitfieldInsertOp(SDNode *N); - SDNode *SelectBitfieldInsertInZeroOp(SDNode *N); + bool tryBitfieldExtractOp(SDNode *N); + bool tryBitfieldExtractOpFromSExt(SDNode *N); + bool tryBitfieldInsertOp(SDNode *N); + bool tryBitfieldInsertInZeroOp(SDNode *N); - SDNode *SelectReadRegister(SDNode *N); - SDNode *SelectWriteRegister(SDNode *N); + bool tryReadRegister(SDNode *N); + bool tryWriteRegister(SDNode *N); // Include the pieces autogenerated from the target description. 
#include "AArch64GenDAGISel.inc" @@ -198,6 +199,9 @@ private: } bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width); + + void SelectCMP_SWAP(SDNode *N); + }; } // end anonymous namespace @@ -328,9 +332,7 @@ static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) { bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const { // it hurts if the value is used at least twice, unless we are optimizing // for code size. - if (ForCodeSize || V.hasOneUse()) - return true; - return false; + return ForCodeSize || V.hasOneUse(); } /// SelectShiftedRegister - Select a "shifted register" operand. If the value @@ -452,7 +454,7 @@ static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp, /// SelectMLAV64LaneV128 - AArch64 supports vector MLAs where one multiplicand /// is a lane in the upper half of a 128-bit vector. Recognize and select this /// so that we don't emit unnecessary lane extracts. -SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) { +bool AArch64DAGToDAGISel::tryMLAV64LaneV128(SDNode *N) { SDLoc dl(N); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); @@ -467,7 +469,7 @@ SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) { if (Op1.getOpcode() != ISD::MUL || !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2, LaneIdx)) - return nullptr; + return false; } SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64); @@ -493,10 +495,11 @@ SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) { break; } - return CurDAG->getMachineNode(MLAOpc, dl, N->getValueType(0), Ops); + ReplaceNode(N, CurDAG->getMachineNode(MLAOpc, dl, N->getValueType(0), Ops)); + return true; } -SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) { +bool AArch64DAGToDAGISel::tryMULLV64LaneV128(unsigned IntNo, SDNode *N) { SDLoc dl(N); SDValue SMULLOp0; SDValue SMULLOp1; @@ -504,7 +507,7 @@ SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) { if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1, LaneIdx)) - return nullptr; + return false; SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64); @@ -537,7 +540,8 @@ SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) { } else llvm_unreachable("Unrecognized intrinsic."); - return CurDAG->getMachineNode(SMULLOpc, dl, N->getValueType(0), Ops); + ReplaceNode(N, CurDAG->getMachineNode(SMULLOpc, dl, N->getValueType(0), Ops)); + return true; } /// Instructions that accept extend modifiers like UXTW expect the register @@ -610,7 +614,7 @@ static bool isWorthFoldingADDlow(SDValue N) { // ldar and stlr have much more restrictive addressing modes (just a // register). 
- if (cast<MemSDNode>(Use)->getOrdering() > Monotonic) + if (isStrongerThanMonotonic(cast<MemSDNode>(Use)->getOrdering())) return false; } @@ -687,7 +691,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, const GlobalValue *GV = GAN->getGlobal(); unsigned Alignment = GV->getAlignment(); - Type *Ty = GV->getType()->getElementType(); + Type *Ty = GV->getValueType(); if (Alignment == 0 && Ty->isSized()) Alignment = DL.getABITypeAlignment(Ty); @@ -797,10 +801,7 @@ bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size, if (ShiftVal != 0 && ShiftVal != LegalShiftVal) return false; - if (isWorthFolding(N)) - return true; - - return false; + return isWorthFolding(N); } bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size, @@ -1015,8 +1016,8 @@ SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs, return SDValue(N, 0); } -SDNode *AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, - unsigned Opc, bool isExt) { +void AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, + bool isExt) { SDLoc dl(N); EVT VT = N->getValueType(0); @@ -1033,13 +1034,13 @@ SDNode *AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, Ops.push_back(N->getOperand(1)); Ops.push_back(RegSeq); Ops.push_back(N->getOperand(NumVecs + ExtOff + 1)); - return CurDAG->getMachineNode(Opc, dl, VT, Ops); + ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops)); } -SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { +bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) { LoadSDNode *LD = cast<LoadSDNode>(N); if (LD->isUnindexed()) - return nullptr; + return false; EVT VT = LD->getMemoryVT(); EVT DstVT = N->getValueType(0); ISD::MemIndexedMode AM = LD->getAddressingMode(); @@ -1101,7 +1102,7 @@ SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { } else if (VT.is128BitVector()) { Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost; } else - return nullptr; + return false; SDValue Chain = LD->getChain(); SDValue Base = LD->getBasePtr(); ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset()); @@ -1112,7 +1113,6 @@ SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { SDNode *Res = CurDAG->getMachineNode(Opcode, dl, MVT::i64, DstVT, MVT::Other, Ops); // Either way, we're replacing the node, so tell the caller that. 
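That "we're replacing the node, so tell the caller" comment describes the new contract used throughout this file: the Select* entry points become try* predicates that return false to let Select() fall through to the generated matcher, and on success they perform the replacement themselves via ReplaceNode/ReplaceUses and CurDAG->RemoveDeadNode instead of handing a replacement SDNode* back. A toy model of that contract (Node and replaceNode below are stand-ins, not the LLVM classes):

  #include <cassert>

  // Editor's toy model of the try*() convention: a matcher either replaces
  // the node and returns true, or leaves it untouched and returns false so
  // the caller can keep looking.
  struct Node { bool Replaced = false; };

  static void replaceNode(Node &N) { N.Replaced = true; }

  static bool tryFancyPattern(Node &N, bool Matches) {
    if (!Matches)
      return false;   // caller falls through to the next matcher
    replaceNode(N);   // matcher does the replacement itself
    return true;      // caller simply returns
  }

  int main() {
    Node N;
    assert(!tryFancyPattern(N, false) && !N.Replaced);
    assert(tryFancyPattern(N, true) && N.Replaced);
    return 0;
  }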
- Done = true; SDValue LoadedVal = SDValue(Res, 1); if (InsertTo64) { SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32); @@ -1127,12 +1127,12 @@ SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { ReplaceUses(SDValue(N, 0), LoadedVal); ReplaceUses(SDValue(N, 1), SDValue(Res, 0)); ReplaceUses(SDValue(N, 2), SDValue(Res, 2)); - - return nullptr; + CurDAG->RemoveDeadNode(N); + return true; } -SDNode *AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, - unsigned Opc, unsigned SubRegIdx) { +void AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, + unsigned SubRegIdx) { SDLoc dl(N); EVT VT = N->getValueType(0); SDValue Chain = N->getOperand(0); @@ -1149,11 +1149,11 @@ SDNode *AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg)); ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); - return nullptr; + CurDAG->RemoveDeadNode(N); } -SDNode *AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, - unsigned Opc, unsigned SubRegIdx) { +void AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, + unsigned Opc, unsigned SubRegIdx) { SDLoc dl(N); EVT VT = N->getValueType(0); SDValue Chain = N->getOperand(0); @@ -1181,11 +1181,11 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, // Update the chain ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2)); - return nullptr; + CurDAG->RemoveDeadNode(N); } -SDNode *AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, - unsigned Opc) { +void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, + unsigned Opc) { SDLoc dl(N); EVT VT = N->getOperand(2)->getValueType(0); @@ -1197,11 +1197,11 @@ SDNode *AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), N->getOperand(0)}; SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops); - return St; + ReplaceNode(N, St); } -SDNode *AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, - unsigned Opc) { +void AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, + unsigned Opc) { SDLoc dl(N); EVT VT = N->getOperand(2)->getValueType(0); const EVT ResTys[] = {MVT::i64, // Type of the write back register @@ -1218,7 +1218,7 @@ SDNode *AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, N->getOperand(0)}; // Chain SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - return St; + ReplaceNode(N, St); } namespace { @@ -1256,8 +1256,8 @@ static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { V128Reg); } -SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { +void AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { SDLoc dl(N); EVT VT = N->getValueType(0); bool Narrow = VT.getSizeInBits() == 64; @@ -1292,12 +1292,11 @@ SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, } ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); - - return Ld; + CurDAG->RemoveDeadNode(N); } -SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { +void AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { SDLoc dl(N); EVT VT = N->getValueType(0); bool Narrow = VT.getSizeInBits() == 64; @@ -1348,12 +1347,11 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, // Update the Chain ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2)); - - 
return Ld; + CurDAG->RemoveDeadNode(N); } -SDNode *AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { +void AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { SDLoc dl(N); EVT VT = N->getOperand(2)->getValueType(0); bool Narrow = VT.getSizeInBits() == 64; @@ -1379,11 +1377,11 @@ SDNode *AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs, MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1); - return St; + ReplaceNode(N, St); } -SDNode *AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { +void AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { SDLoc dl(N); EVT VT = N->getOperand(2)->getValueType(0); bool Narrow = VT.getSizeInBits() == 64; @@ -1414,7 +1412,7 @@ SDNode *AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs, MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1); - return St; + ReplaceNode(N, St); } static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, @@ -1441,25 +1439,25 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, // form these situations when matching bigger pattern (bitfield insert). // For unsigned extracts, check for a shift right and mask - uint64_t And_imm = 0; - if (!isOpcWithIntImmediate(N, ISD::AND, And_imm)) + uint64_t AndImm = 0; + if (!isOpcWithIntImmediate(N, ISD::AND, AndImm)) return false; const SDNode *Op0 = N->getOperand(0).getNode(); // Because of simplify-demanded-bits in DAGCombine, the mask may have been // simplified. Try to undo that - And_imm |= (1 << NumberOfIgnoredLowBits) - 1; + AndImm |= (1 << NumberOfIgnoredLowBits) - 1; // The immediate is a mask of the low bits iff imm & (imm+1) == 0 - if (And_imm & (And_imm + 1)) + if (AndImm & (AndImm + 1)) return false; bool ClampMSB = false; - uint64_t Srl_imm = 0; + uint64_t SrlImm = 0; // Handle the SRL + ANY_EXTEND case. if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND && - isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, Srl_imm)) { + isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, SrlImm)) { // Extend the incoming operand of the SRL to 64-bit. Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0)); // Make sure to clamp the MSB so that we preserve the semantics of the @@ -1467,13 +1465,13 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, ClampMSB = true; } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE && isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, - Srl_imm)) { + SrlImm)) { // If the shift result was truncated, we can still combine them. Opd0 = Op0->getOperand(0).getOperand(0); // Use the type of SRL node. VT = Opd0->getValueType(0); - } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) { + } else if (isOpcWithIntImmediate(Op0, ISD::SRL, SrlImm)) { Opd0 = Op0->getOperand(0); } else if (BiggerPattern) { // Let's pretend a 0 shift right has been performed. @@ -1487,15 +1485,15 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, // Bail out on large immediates. This happens when no proper // combining/constant folding was performed. 
- if (!BiggerPattern && (Srl_imm <= 0 || Srl_imm >= VT.getSizeInBits())) { + if (!BiggerPattern && (SrlImm <= 0 || SrlImm >= VT.getSizeInBits())) { DEBUG((dbgs() << N << ": Found large shift immediate, this should not happen\n")); return false; } - LSB = Srl_imm; - MSB = Srl_imm + (VT == MVT::i32 ? countTrailingOnes<uint32_t>(And_imm) - : countTrailingOnes<uint64_t>(And_imm)) - + LSB = SrlImm; + MSB = SrlImm + (VT == MVT::i32 ? countTrailingOnes<uint32_t>(AndImm) + : countTrailingOnes<uint64_t>(AndImm)) - 1; if (ClampMSB) // Since we're moving the extend before the right shift operation, we need @@ -1508,6 +1506,39 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, return true; } +static bool isBitfieldExtractOpFromSExtInReg(SDNode *N, unsigned &Opc, + SDValue &Opd0, unsigned &Immr, + unsigned &Imms) { + assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG); + + EVT VT = N->getValueType(0); + unsigned BitWidth = VT.getSizeInBits(); + assert((VT == MVT::i32 || VT == MVT::i64) && + "Type checking must have been done before calling this function"); + + SDValue Op = N->getOperand(0); + if (Op->getOpcode() == ISD::TRUNCATE) { + Op = Op->getOperand(0); + VT = Op->getValueType(0); + BitWidth = VT.getSizeInBits(); + } + + uint64_t ShiftImm; + if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRL, ShiftImm) && + !isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm)) + return false; + + unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits(); + if (ShiftImm + Width > BitWidth) + return false; + + Opc = (VT == MVT::i32) ? AArch64::SBFMWri : AArch64::SBFMXri; + Opd0 = Op.getOperand(0); + Immr = ShiftImm; + Imms = ShiftImm + Width - 1; + return true; +} + static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, unsigned &LSB, unsigned &MSB) { @@ -1522,32 +1553,32 @@ static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc, // // This gets selected into a single UBFM: // - // UBFM Value, ShiftImm, BitWide + Srl_imm -1 + // UBFM Value, ShiftImm, BitWide + SrlImm -1 // if (N->getOpcode() != ISD::SRL) return false; - uint64_t And_mask = 0; - if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_mask)) + uint64_t AndMask = 0; + if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, AndMask)) return false; Opd0 = N->getOperand(0).getOperand(0); - uint64_t Srl_imm = 0; - if (!isIntImmediate(N->getOperand(1), Srl_imm)) + uint64_t SrlImm = 0; + if (!isIntImmediate(N->getOperand(1), SrlImm)) return false; // Check whether we really have several bits extract here. - unsigned BitWide = 64 - countLeadingOnes(~(And_mask >> Srl_imm)); - if (BitWide && isMask_64(And_mask >> Srl_imm)) { + unsigned BitWide = 64 - countLeadingOnes(~(AndMask >> SrlImm)); + if (BitWide && isMask_64(AndMask >> SrlImm)) { if (N->getValueType(0) == MVT::i32) Opc = AArch64::UBFMWri; else Opc = AArch64::UBFMXri; - LSB = Srl_imm; - MSB = BitWide + Srl_imm - 1; + LSB = SrlImm; + MSB = BitWide + SrlImm - 1; return true; } @@ -1572,10 +1603,10 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, Immr, Imms)) return true; - // we're looking for a shift of a shift - uint64_t Shl_imm = 0; - uint64_t Trunc_bits = 0; - if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) { + // We're looking for a shift of a shift. 
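For the mask-after-shift form, the arithmetic above is LSB = SrlImm and MSB = SrlImm + countTrailingOnes(AndImm) - 1, i.e. a UBFX of countTrailingOnes(AndImm) bits starting at the shift amount (the new SIGN_EXTEND_INREG helper above derives its SBFM operands the same way from the field width). A worked check of one case (editor's sketch, not from the patch):

  #include <cassert>
  #include <cstdint>

  // Editor's worked example of the AND+SRL -> UBFM mapping: UBFM immr=LSB,
  // imms=MSB with MSB >= LSB is UBFX #LSB, #(MSB - LSB + 1).
  static uint32_t ubfx32(uint32_t X, unsigned Lsb, unsigned Width) {
    return (X >> Lsb) & ((Width == 32) ? ~0u : ((1u << Width) - 1));
  }

  int main() {
    const uint32_t X = 0xDEADBEEF;
    // (X >> 5) & 0x3F: SrlImm = 5, countTrailingOnes(0x3F) = 6,
    // so LSB = 5 and MSB = 10 -> UBFMWri X, 5, 10 == UBFX X, #5, #6.
    assert(((X >> 5) & 0x3F) == ubfx32(X, 5, 6));
    return 0;
  }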
+ uint64_t ShlImm = 0; + uint64_t TruncBits = 0; + if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, ShlImm)) { Opd0 = N->getOperand(0).getOperand(0); } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL && N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) { @@ -1584,7 +1615,7 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, // always generate 64bit UBFM. This consistency will help the CSE pass // later find more redundancy. Opd0 = N->getOperand(0).getOperand(0); - Trunc_bits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits(); + TruncBits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits(); VT = Opd0->getValueType(0); assert(VT == MVT::i64 && "the promoted type should be i64"); } else if (BiggerPattern) { @@ -1597,21 +1628,21 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, // Missing combines/constant folding may have left us with strange // constants. - if (Shl_imm >= VT.getSizeInBits()) { + if (ShlImm >= VT.getSizeInBits()) { DEBUG((dbgs() << N << ": Found large shift immediate, this should not happen\n")); return false; } - uint64_t Srl_imm = 0; - if (!isIntImmediate(N->getOperand(1), Srl_imm)) + uint64_t SrlImm = 0; + if (!isIntImmediate(N->getOperand(1), SrlImm)) return false; - assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() && + assert(SrlImm > 0 && SrlImm < VT.getSizeInBits() && "bad amount in shift node!"); - int immr = Srl_imm - Shl_imm; + int immr = SrlImm - ShlImm; Immr = immr < 0 ? immr + VT.getSizeInBits() : immr; - Imms = VT.getSizeInBits() - Shl_imm - Trunc_bits - 1; + Imms = VT.getSizeInBits() - ShlImm - TruncBits - 1; // SRA requires a signed extraction if (VT == MVT::i32) Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri; @@ -1620,6 +1651,30 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, return true; } +bool AArch64DAGToDAGISel::tryBitfieldExtractOpFromSExt(SDNode *N) { + assert(N->getOpcode() == ISD::SIGN_EXTEND); + + EVT VT = N->getValueType(0); + EVT NarrowVT = N->getOperand(0)->getValueType(0); + if (VT != MVT::i64 || NarrowVT != MVT::i32) + return false; + + uint64_t ShiftImm; + SDValue Op = N->getOperand(0); + if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm)) + return false; + + SDLoc dl(N); + // Extend the incoming operand of the shift to 64-bits. 
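tryBitfieldExtractOpFromSExt folds a 64-bit sign-extend of a 32-bit arithmetic shift right into a single SBFMXri on the widened operand: Immr is the shift amount and Imms stays 31, which as an SBFX extracts and sign-extends bits [31:Immr]. A self-contained check of that equivalence (editor's sketch; it relies on the usual two's-complement arithmetic right shift):

  #include <assert.h>
  #include <stdint.h>

  // Editor's worked example for sext(i32 (sra X, C)) -> SBFMXri:
  // SBFM Xd, Xn, #C, #31 sign-extends bits [31:C] of the widened source.
  static int64_t sbfx64(uint64_t X, unsigned Lsb, unsigned Width) {
    // Shift the field to the top, then arithmetic-shift it back down.
    return (int64_t)(X << (64 - Lsb - Width)) >> (64 - Width);
  }

  int main() {
    const int32_t X = -123456789; // negative, to exercise sign extension
    const unsigned C = 7;
    int64_t Expected = (int64_t)(X >> C);          // sext(sra X, 7)
    int64_t Got = sbfx64((uint32_t)X, C, 32 - C);  // SBFM #7, #31 == SBFX #7, #25
    assert(Expected == Got);
    return 0;
  }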
+ SDValue Opd0 = Widen(CurDAG, Op.getOperand(0)); + unsigned Immr = ShiftImm; + unsigned Imms = NarrowVT.getSizeInBits() - 1; + SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT), + CurDAG->getTargetConstant(Imms, dl, VT)}; + CurDAG->SelectNodeTo(N, AArch64::SBFMXri, VT, Ops); + return true; +} + static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, SDValue &Opd0, unsigned &Immr, unsigned &Imms, unsigned NumberOfIgnoredLowBits = 0, @@ -1638,6 +1693,9 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, case ISD::SRL: case ISD::SRA: return isBitfieldExtractOpFromShr(N, Opc, Opd0, Immr, Imms, BiggerPattern); + + case ISD::SIGN_EXTEND_INREG: + return isBitfieldExtractOpFromSExtInReg(N, Opc, Opd0, Immr, Imms); } unsigned NOpc = N->getMachineOpcode(); @@ -1658,11 +1716,11 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, return false; } -SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) { +bool AArch64DAGToDAGISel::tryBitfieldExtractOp(SDNode *N) { unsigned Opc, Immr, Imms; SDValue Opd0; if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, Immr, Imms)) - return nullptr; + return false; EVT VT = N->getValueType(0); SDLoc dl(N); @@ -1675,22 +1733,22 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) { SDNode *BFM = CurDAG->getMachineNode(Opc, dl, MVT::i64, Ops64); SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32); - MachineSDNode *Node = - CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::i32, - SDValue(BFM, 0), SubReg); - return Node; + ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, + MVT::i32, SDValue(BFM, 0), SubReg)); + return true; } SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT), CurDAG->getTargetConstant(Imms, dl, VT)}; - return CurDAG->SelectNodeTo(N, Opc, VT, Ops); + CurDAG->SelectNodeTo(N, Opc, VT, Ops); + return true; } /// Does DstMask form a complementary pair with the mask provided by /// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking, /// this asks whether DstMask zeroes precisely those bits that will be set by /// the other half. 
-static bool isBitfieldDstMask(uint64_t DstMask, APInt BitsToBeInserted, +static bool isBitfieldDstMask(uint64_t DstMask, const APInt &BitsToBeInserted, unsigned NumberOfIgnoredHighBits, EVT VT) { assert((VT == MVT::i32 || VT == MVT::i64) && "i32 or i64 mask type expected!"); @@ -1851,6 +1909,20 @@ static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits, case AArch64::BFMWri: case AArch64::BFMXri: return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth); + + case AArch64::STRBBui: + case AArch64::STURBBi: + if (UserNode->getOperand(0) != Orig) + return; + UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xff); + return; + + case AArch64::STRHHui: + case AArch64::STURHHi: + if (UserNode->getOperand(0) != Orig) + return; + UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xffff); + return; } } @@ -1963,36 +2035,129 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, return true; } -// Given a OR operation, check if we have the following pattern -// ubfm c, b, imm, imm2 (or something that does the same jobs, see -// isBitfieldExtractOp) -// d = e & mask2 ; where mask is a binary sequence of 1..10..0 and -// countTrailingZeros(mask2) == imm2 - imm + 1 -// f = d | c -// if yes, given reference arguments will be update so that one can replace -// the OR instruction with: -// f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2 -static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, - SDValue &Src, unsigned &ImmR, - unsigned &ImmS, const APInt &UsefulBits, - SelectionDAG *CurDAG) { +static bool isShiftedMask(uint64_t Mask, EVT VT) { + assert(VT == MVT::i32 || VT == MVT::i64); + if (VT == MVT::i32) + return isShiftedMask_32(Mask); + return isShiftedMask_64(Mask); +} + +// Generate a BFI/BFXIL from 'or (and X, MaskImm), OrImm' iff the value being +// inserted only sets known zero bits. +static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) { assert(N->getOpcode() == ISD::OR && "Expect a OR operation"); - // Set Opc EVT VT = N->getValueType(0); - if (VT == MVT::i32) - Opc = AArch64::BFMWri; - else if (VT == MVT::i64) - Opc = AArch64::BFMXri; - else + if (VT != MVT::i32 && VT != MVT::i64) + return false; + + unsigned BitWidth = VT.getSizeInBits(); + + uint64_t OrImm; + if (!isOpcWithIntImmediate(N, ISD::OR, OrImm)) + return false; + + // Skip this transformation if the ORR immediate can be encoded in the ORR. + // Otherwise, we'll trade an AND+ORR for ORR+BFI/BFXIL, which is most likely + // performance neutral. + if (AArch64_AM::isLogicalImmediate(OrImm, BitWidth)) return false; + uint64_t MaskImm; + SDValue And = N->getOperand(0); + // Must be a single use AND with an immediate operand. + if (!And.hasOneUse() || + !isOpcWithIntImmediate(And.getNode(), ISD::AND, MaskImm)) + return false; + + // Compute the Known Zero for the AND as this allows us to catch more general + // cases than just looking for AND with imm. + APInt KnownZero, KnownOne; + CurDAG->computeKnownBits(And, KnownZero, KnownOne); + + // Non-zero in the sense that they're not provably zero, which is the key + // point if we want to use this value. + uint64_t NotKnownZero = (~KnownZero).getZExtValue(); + + // The KnownZero mask must be a shifted mask (e.g., 1110..011, 11100..00). + if (!isShiftedMask(KnownZero.getZExtValue(), VT)) + return false; + + // The bits being inserted must only set those bits that are known to be zero. 
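The transform being built here trades an AND plus an un-encodable ORR immediate for a MOV of the small constant plus a BFXIL/BFI that writes only the known-zero field. For example, with a mask of 0xffffff00 the low byte is known zero, so OR-ing in 0x5a (which is not a valid AArch64 logical immediate) becomes MOV #0x5a followed by a BFXIL with lsb 0 and width 8. A check of that equivalence (editor's sketch; the concrete constants are an illustration, not from the patch):

  #include <cassert>
  #include <cstdint>

  // Editor's sketch: BFXIL Wd, Wn, #lsb, #width copies 'width' bits starting
  // at 'lsb' of Wn into the low bits of Wd, leaving the rest of Wd alone.
  static uint32_t bfxil32(uint32_t Dst, uint32_t Src, unsigned Lsb,
                          unsigned Width) {
    uint32_t Mask = (Width == 32) ? ~0u : ((1u << Width) - 1);
    return (Dst & ~Mask) | ((Src >> Lsb) & Mask);
  }

  int main() {
    const uint32_t X = 0x12345678;
    uint32_t AndOr = (X & 0xFFFFFF00u) | 0x5Au;  // the pattern being matched
    uint32_t MovBfxil = bfxil32(X, 0x5Au, 0, 8); // MOV #0x5a; BFXIL X, tmp, #0, #8
    assert(AndOr == MovBfxil);
    return 0;
  }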
+ if ((OrImm & NotKnownZero) != 0) { + // FIXME: It's okay if the OrImm sets NotKnownZero bits to 1, but we don't + // currently handle this case. + return false; + } + + // BFI/BFXIL dst, src, #lsb, #width. + int LSB = countTrailingOnes(NotKnownZero); + int Width = BitWidth - APInt(BitWidth, NotKnownZero).countPopulation(); + + // BFI/BFXIL is an alias of BFM, so translate to BFM operands. + unsigned ImmR = (BitWidth - LSB) % BitWidth; + unsigned ImmS = Width - 1; + + // If we're creating a BFI instruction avoid cases where we need more + // instructions to materialize the BFI constant as compared to the original + // ORR. A BFXIL will use the same constant as the original ORR, so the code + // should be no worse in this case. + bool IsBFI = LSB != 0; + uint64_t BFIImm = OrImm >> LSB; + if (IsBFI && !AArch64_AM::isLogicalImmediate(BFIImm, BitWidth)) { + // We have a BFI instruction and we know the constant can't be materialized + // with a ORR-immediate with the zero register. + unsigned OrChunks = 0, BFIChunks = 0; + for (unsigned Shift = 0; Shift < BitWidth; Shift += 16) { + if (((OrImm >> Shift) & 0xFFFF) != 0) + ++OrChunks; + if (((BFIImm >> Shift) & 0xFFFF) != 0) + ++BFIChunks; + } + if (BFIChunks > OrChunks) + return false; + } + + // Materialize the constant to be inserted. + SDLoc DL(N); + unsigned MOVIOpc = VT == MVT::i32 ? AArch64::MOVi32imm : AArch64::MOVi64imm; + SDNode *MOVI = CurDAG->getMachineNode( + MOVIOpc, DL, VT, CurDAG->getTargetConstant(BFIImm, DL, VT)); + + // Create the BFI/BFXIL instruction. + SDValue Ops[] = {And.getOperand(0), SDValue(MOVI, 0), + CurDAG->getTargetConstant(ImmR, DL, VT), + CurDAG->getTargetConstant(ImmS, DL, VT)}; + unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri; + CurDAG->SelectNodeTo(N, Opc, VT, Ops); + return true; +} + +static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits, + SelectionDAG *CurDAG) { + assert(N->getOpcode() == ISD::OR && "Expect a OR operation"); + + EVT VT = N->getValueType(0); + if (VT != MVT::i32 && VT != MVT::i64) + return false; + + unsigned BitWidth = VT.getSizeInBits(); + // Because of simplify-demanded-bits in DAGCombine, involved masks may not // have the expected shape. Try to undo that. unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros(); unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros(); + // Given a OR operation, check if we have the following pattern + // ubfm c, b, imm, imm2 (or something that does the same jobs, see + // isBitfieldExtractOp) + // d = e & mask2 ; where mask is a binary sequence of 1..10..0 and + // countTrailingZeros(mask2) == imm2 - imm + 1 + // f = d | c + // if yes, replace the OR instruction with: + // f = BFM Opd0, Opd1, LSB, MSB ; where LSB = imm, and MSB = imm2 + // OR is commutative, check all combinations of operand order and values of // BiggerPattern, i.e. // Opd0, Opd1, BiggerPattern=false @@ -2004,8 +2169,11 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, // and/or inserting fewer extra instructions. for (int I = 0; I < 4; ++I) { + SDValue Dst, Src; + unsigned ImmR, ImmS; bool BiggerPattern = I / 2; - SDNode *OrOpd0 = N->getOperand(I % 2).getNode(); + SDValue OrOpd0Val = N->getOperand(I % 2); + SDNode *OrOpd0 = OrOpd0Val.getNode(); SDValue OrOpd1Val = N->getOperand((I + 1) % 2); SDNode *OrOpd1 = OrOpd1Val.getNode(); @@ -2030,10 +2198,10 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, // If the mask on the insertee is correct, we have a BFXIL operation. 
We // can share the ImmR and ImmS values from the already-computed UBFM. - } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), + } else if (isBitfieldPositioningOp(CurDAG, OrOpd0Val, BiggerPattern, Src, DstLSB, Width)) { - ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); + ImmR = (BitWidth - DstLSB) % BitWidth; ImmS = Width - 1; } else continue; @@ -2069,60 +2237,98 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, Dst = OrOpd1Val; // both parts match + SDLoc DL(N); + SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ImmR, DL, VT), + CurDAG->getTargetConstant(ImmS, DL, VT)}; + unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri; + CurDAG->SelectNodeTo(N, Opc, VT, Ops); + return true; + } + + // Generate a BFXIL from 'or (and X, Mask0Imm), (and Y, Mask1Imm)' iff + // Mask0Imm and ~Mask1Imm are equivalent and one of the MaskImms is a shifted + // mask (e.g., 0x000ffff0). + uint64_t Mask0Imm, Mask1Imm; + SDValue And0 = N->getOperand(0); + SDValue And1 = N->getOperand(1); + if (And0.hasOneUse() && And1.hasOneUse() && + isOpcWithIntImmediate(And0.getNode(), ISD::AND, Mask0Imm) && + isOpcWithIntImmediate(And1.getNode(), ISD::AND, Mask1Imm) && + APInt(BitWidth, Mask0Imm) == ~APInt(BitWidth, Mask1Imm) && + (isShiftedMask(Mask0Imm, VT) || isShiftedMask(Mask1Imm, VT))) { + + // ORR is commutative, so canonicalize to the form 'or (and X, Mask0Imm), + // (and Y, Mask1Imm)' where Mask1Imm is the shifted mask masking off the + // bits to be inserted. + if (isShiftedMask(Mask0Imm, VT)) { + std::swap(And0, And1); + std::swap(Mask0Imm, Mask1Imm); + } + + SDValue Src = And1->getOperand(0); + SDValue Dst = And0->getOperand(0); + unsigned LSB = countTrailingZeros(Mask1Imm); + int Width = BitWidth - APInt(BitWidth, Mask0Imm).countPopulation(); + + // The BFXIL inserts the low-order bits from a source register, so right + // shift the needed bits into place. + SDLoc DL(N); + unsigned ShiftOpc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri; + SDNode *LSR = CurDAG->getMachineNode( + ShiftOpc, DL, VT, Src, CurDAG->getTargetConstant(LSB, DL, VT), + CurDAG->getTargetConstant(BitWidth - 1, DL, VT)); + + // BFXIL is an alias of BFM, so translate to BFM operands. + unsigned ImmR = (BitWidth - LSB) % BitWidth; + unsigned ImmS = Width - 1; + + // Create the BFXIL instruction. + SDValue Ops[] = {Dst, SDValue(LSR, 0), + CurDAG->getTargetConstant(ImmR, DL, VT), + CurDAG->getTargetConstant(ImmS, DL, VT)}; + unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri; + CurDAG->SelectNodeTo(N, Opc, VT, Ops); return true; } return false; } -SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) { +bool AArch64DAGToDAGISel::tryBitfieldInsertOp(SDNode *N) { if (N->getOpcode() != ISD::OR) - return nullptr; + return false; - unsigned Opc; - unsigned LSB, MSB; - SDValue Opd0, Opd1; - EVT VT = N->getValueType(0); APInt NUsefulBits; getUsefulBits(SDValue(N, 0), NUsefulBits); // If all bits are not useful, just return UNDEF. 
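The second new pattern above handles 'or (and X, Mask0Imm), (and Y, Mask1Imm)' when the two masks are exact complements and one of them is a contiguous (shifted) mask: the contiguous field comes from Y and everything else from X, so the OR collapses to an LSR of Y followed by a single BFM insert into X. Using the 0x000ffff0 mask mentioned in the comment above as a concrete case (editor's sketch, not from the patch):

  #include <cassert>
  #include <cstdint>

  // Editor's sketch: BFI Wd, Wn, #lsb, #width inserts the low 'width' bits of
  // Wn at bit 'lsb' of Wd.
  static uint32_t bfi32(uint32_t Dst, uint32_t Src, unsigned Lsb,
                        unsigned Width) {
    uint32_t FieldMask = ((Width == 32) ? ~0u : ((1u << Width) - 1)) << Lsb;
    return (Dst & ~FieldMask) | ((Src << Lsb) & FieldMask);
  }

  int main() {
    const uint32_t X = 0xAABBCCDD, Y = 0x11223344;
    // Mask1Imm = 0x000ffff0 (16-bit field at bit 4), Mask0Imm = ~Mask1Imm.
    uint32_t OrOfAnds = (X & 0xFFF0000Fu) | (Y & 0x000FFFF0u);
    // The patch emits a UBFM (an LSR by 4) to move Y's field down to bit 0,
    // then a BFM that re-inserts those 16 bits at bit 4 of X.
    uint32_t Lsr = Y >> 4;
    uint32_t Emitted = bfi32(X, Lsr, 4, 16);
    assert(OrOfAnds == Emitted);
    return 0;
  }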
- if (!NUsefulBits) - return CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, VT); + if (!NUsefulBits) { + CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, N->getValueType(0)); + return true; + } - if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, NUsefulBits, - CurDAG)) - return nullptr; + if (tryBitfieldInsertOpFromOr(N, NUsefulBits, CurDAG)) + return true; - SDLoc dl(N); - SDValue Ops[] = { Opd0, - Opd1, - CurDAG->getTargetConstant(LSB, dl, VT), - CurDAG->getTargetConstant(MSB, dl, VT) }; - return CurDAG->SelectNodeTo(N, Opc, VT, Ops); + return tryBitfieldInsertOpFromOrAndImm(N, CurDAG); } /// SelectBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the /// equivalent of a left shift by a constant amount followed by an and masking /// out a contiguous set of bits. -SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertInZeroOp(SDNode *N) { +bool AArch64DAGToDAGISel::tryBitfieldInsertInZeroOp(SDNode *N) { if (N->getOpcode() != ISD::AND) - return nullptr; + return false; EVT VT = N->getValueType(0); - unsigned Opc; - if (VT == MVT::i32) - Opc = AArch64::UBFMWri; - else if (VT == MVT::i64) - Opc = AArch64::UBFMXri; - else - return nullptr; + if (VT != MVT::i32 && VT != MVT::i64) + return false; SDValue Op0; int DstLSB, Width; if (!isBitfieldPositioningOp(CurDAG, SDValue(N, 0), /*BiggerPattern=*/false, Op0, DstLSB, Width)) - return nullptr; + return false; // ImmR is the rotate right amount. unsigned ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); @@ -2132,7 +2338,9 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertInZeroOp(SDNode *N) { SDLoc DL(N); SDValue Ops[] = {Op0, CurDAG->getTargetConstant(ImmR, DL, VT), CurDAG->getTargetConstant(ImmS, DL, VT)}; - return CurDAG->SelectNodeTo(N, Opc, VT, Ops); + unsigned Opc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri; + CurDAG->SelectNodeTo(N, Opc, VT, Ops); + return true; } bool @@ -2214,62 +2422,68 @@ static int getIntOperandFromRegisterString(StringRef RegString) { // register string argument is either of the form detailed in the ALCE (the // form described in getIntOperandsFromRegsterString) or is a named register // known by the MRS SysReg mapper. -SDNode *AArch64DAGToDAGISel::SelectReadRegister(SDNode *N) { +bool AArch64DAGToDAGISel::tryReadRegister(SDNode *N) { const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1)); const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0)); SDLoc DL(N); int Reg = getIntOperandFromRegisterString(RegString->getString()); - if (Reg != -1) - return CurDAG->getMachineNode(AArch64::MRS, DL, N->getSimpleValueType(0), - MVT::Other, - CurDAG->getTargetConstant(Reg, DL, MVT::i32), - N->getOperand(0)); + if (Reg != -1) { + ReplaceNode(N, CurDAG->getMachineNode( + AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other, + CurDAG->getTargetConstant(Reg, DL, MVT::i32), + N->getOperand(0))); + return true; + } // Use the sysreg mapper to map the remaining possible strings to the // value for the register to be used for the instruction operand. 
- AArch64SysReg::MRSMapper mapper; - bool IsValidSpecialReg; - Reg = mapper.fromString(RegString->getString(), - Subtarget->getFeatureBits(), - IsValidSpecialReg); - if (IsValidSpecialReg) - return CurDAG->getMachineNode(AArch64::MRS, DL, N->getSimpleValueType(0), - MVT::Other, - CurDAG->getTargetConstant(Reg, DL, MVT::i32), - N->getOperand(0)); + auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString()); + if (TheReg && TheReg->Readable && + TheReg->haveFeatures(Subtarget->getFeatureBits())) + Reg = TheReg->Encoding; + else + Reg = AArch64SysReg::parseGenericRegister(RegString->getString()); + + if (Reg != -1) { + ReplaceNode(N, CurDAG->getMachineNode( + AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other, + CurDAG->getTargetConstant(Reg, DL, MVT::i32), + N->getOperand(0))); + return true; + } - return nullptr; + return false; } // Lower the write_register intrinsic to an MSR instruction node if the special // register string argument is either of the form detailed in the ALCE (the // form described in getIntOperandsFromRegsterString) or is a named register // known by the MSR SysReg mapper. -SDNode *AArch64DAGToDAGISel::SelectWriteRegister(SDNode *N) { +bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) { const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1)); const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0)); SDLoc DL(N); int Reg = getIntOperandFromRegisterString(RegString->getString()); - if (Reg != -1) - return CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other, + if (Reg != -1) { + ReplaceNode( + N, CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other, CurDAG->getTargetConstant(Reg, DL, MVT::i32), - N->getOperand(2), N->getOperand(0)); + N->getOperand(2), N->getOperand(0))); + return true; + } // Check if the register was one of those allowed as the pstatefield value in // the MSR (immediate) instruction. To accept the values allowed in the // pstatefield for the MSR (immediate) instruction, we also require that an // immediate value has been provided as an argument, we know that this is // the case as it has been ensured by semantic checking. - AArch64PState::PStateMapper PMapper; - bool IsValidSpecialReg; - Reg = PMapper.fromString(RegString->getString(), - Subtarget->getFeatureBits(), - IsValidSpecialReg); - if (IsValidSpecialReg) { + auto PMapper = AArch64PState::lookupPStateByName(RegString->getString());; + if (PMapper) { assert (isa<ConstantSDNode>(N->getOperand(2)) && "Expected a constant integer expression."); + unsigned Reg = PMapper->Encoding; uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); unsigned State; if (Reg == AArch64PState::PAN || Reg == AArch64PState::UAO) { @@ -2279,29 +2493,66 @@ SDNode *AArch64DAGToDAGISel::SelectWriteRegister(SDNode *N) { assert(Immed < 16 && "Bad imm"); State = AArch64::MSRpstateImm4; } - return CurDAG->getMachineNode(State, DL, MVT::Other, - CurDAG->getTargetConstant(Reg, DL, MVT::i32), - CurDAG->getTargetConstant(Immed, DL, MVT::i16), - N->getOperand(0)); + ReplaceNode(N, CurDAG->getMachineNode( + State, DL, MVT::Other, + CurDAG->getTargetConstant(Reg, DL, MVT::i32), + CurDAG->getTargetConstant(Immed, DL, MVT::i16), + N->getOperand(0))); + return true; } // Use the sysreg mapper to attempt to map the remaining possible strings // to the value for the register to be used for the MSR (register) // instruction operand. 
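tryReadRegister above, and the tail of tryWriteRegister that follows, now resolve the metadata string the same way: first the fixed sN_N_cN_cN_N spelling via getIntOperandFromRegisterString, then the TableGen-generated lookupSysRegByName table filtered by Readable/Writeable and the subtarget's feature bits, then parseGenericRegister, and they return false only when all of those fail. A toy model of that fallback chain (the table contents, names, and parser below are invented stand-ins for the TableGen data, purely for illustration):

  #include <cassert>
  #include <string>

  // Editor's toy model of the lookup order used above.
  struct SysRegEntry { const char *Name; int Encoding; bool Readable; };
  static const SysRegEntry ExampleTable[] = {{"EXAMPLEREG_EL1", 0x1234, true}};

  static int parseGeneric(const std::string &Name) {
    // Stand-in for AArch64SysReg::parseGenericRegister, which accepts the
    // "s<op0>_<op1>_c<n>_c<m>_<op2>" spelling; here we only pretend to.
    return (!Name.empty() && Name[0] == 's') ? 0 : -1;
  }

  static int resolveReadable(const std::string &Name) {
    for (const SysRegEntry &E : ExampleTable)
      if (E.Readable && Name == E.Name)
        return E.Encoding;       // named register (feature check elided)
    return parseGeneric(Name);   // otherwise try the generic spelling
  }

  int main() {
    assert(resolveReadable("EXAMPLEREG_EL1") == 0x1234);
    assert(resolveReadable("not_a_register") == -1);
    return 0;
  }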
- AArch64SysReg::MSRMapper Mapper; - Reg = Mapper.fromString(RegString->getString(), - Subtarget->getFeatureBits(), - IsValidSpecialReg); + auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString()); + if (TheReg && TheReg->Writeable && + TheReg->haveFeatures(Subtarget->getFeatureBits())) + Reg = TheReg->Encoding; + else + Reg = AArch64SysReg::parseGenericRegister(RegString->getString()); + if (Reg != -1) { + ReplaceNode(N, CurDAG->getMachineNode( + AArch64::MSR, DL, MVT::Other, + CurDAG->getTargetConstant(Reg, DL, MVT::i32), + N->getOperand(2), N->getOperand(0))); + return true; + } - if (IsValidSpecialReg) - return CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other, - CurDAG->getTargetConstant(Reg, DL, MVT::i32), - N->getOperand(2), N->getOperand(0)); + return false; +} + +/// We've got special pseudo-instructions for these +void AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) { + unsigned Opcode; + EVT MemTy = cast<MemSDNode>(N)->getMemoryVT(); + if (MemTy == MVT::i8) + Opcode = AArch64::CMP_SWAP_8; + else if (MemTy == MVT::i16) + Opcode = AArch64::CMP_SWAP_16; + else if (MemTy == MVT::i32) + Opcode = AArch64::CMP_SWAP_32; + else if (MemTy == MVT::i64) + Opcode = AArch64::CMP_SWAP_64; + else + llvm_unreachable("Unknown AtomicCmpSwap type"); - return nullptr; + MVT RegTy = MemTy == MVT::i64 ? MVT::i64 : MVT::i32; + SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3), + N->getOperand(0)}; + SDNode *CmpSwap = CurDAG->getMachineNode( + Opcode, SDLoc(N), + CurDAG->getVTList(RegTy, MVT::i32, MVT::Other), Ops); + + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1); + + ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0)); + ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2)); + CurDAG->RemoveDeadNode(N); } -SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { +void AArch64DAGToDAGISel::Select(SDNode *Node) { // Dump information about the Node being selected DEBUG(errs() << "Selecting: "); DEBUG(Node->dump(CurDAG)); @@ -2311,54 +2562,61 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { if (Node->isMachineOpcode()) { DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n"); Node->setNodeId(-1); - return nullptr; + return; } // Few custom selection stuff. - SDNode *ResNode = nullptr; EVT VT = Node->getValueType(0); switch (Node->getOpcode()) { default: break; + case ISD::ATOMIC_CMP_SWAP: + SelectCMP_SWAP(Node); + return; + case ISD::READ_REGISTER: - if (SDNode *Res = SelectReadRegister(Node)) - return Res; + if (tryReadRegister(Node)) + return; break; case ISD::WRITE_REGISTER: - if (SDNode *Res = SelectWriteRegister(Node)) - return Res; + if (tryWriteRegister(Node)) + return; break; case ISD::ADD: - if (SDNode *I = SelectMLAV64LaneV128(Node)) - return I; + if (tryMLAV64LaneV128(Node)) + return; break; case ISD::LOAD: { // Try to select as an indexed load. Fall through to normal processing // if we can't. 
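Stepping back to the top of this hunk: ISD::ATOMIC_CMP_SWAP is now selected to the new CMP_SWAP_8/16/32/64 pseudos, which produce the loaded value, an i32 status, and the chain; the pseudo is later expanded into a load-exclusive/store-exclusive retry loop. The observable semantics are those of a strong compare-exchange, as in this plain-C++ sketch (editor's illustration, not the expansion itself):

  #include <atomic>
  #include <cassert>
  #include <cstdint>

  // Editor's sketch of what a CMP_SWAP_32 pseudo computes: the updated memory
  // word plus a success/failure status.
  static bool cmpSwap32(std::atomic<uint32_t> &Mem, uint32_t &Expected,
                        uint32_t Desired) {
    return Mem.compare_exchange_strong(Expected, Desired);
  }

  int main() {
    std::atomic<uint32_t> Mem{42};
    uint32_t Expected = 42;
    assert(cmpSwap32(Mem, Expected, 7) && Mem.load() == 7);
    Expected = 42; // stale expectation: the swap must fail
    assert(!cmpSwap32(Mem, Expected, 9) && Expected == 7);
    return 0;
  }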
- bool Done = false; - SDNode *I = SelectIndexedLoad(Node, Done); - if (Done) - return I; + if (tryIndexedLoad(Node)) + return; break; } case ISD::SRL: case ISD::AND: case ISD::SRA: - if (SDNode *I = SelectBitfieldExtractOp(Node)) - return I; - if (SDNode *I = SelectBitfieldInsertInZeroOp(Node)) - return I; + case ISD::SIGN_EXTEND_INREG: + if (tryBitfieldExtractOp(Node)) + return; + if (tryBitfieldInsertInZeroOp(Node)) + return; + break; + + case ISD::SIGN_EXTEND: + if (tryBitfieldExtractOpFromSExt(Node)) + return; break; case ISD::OR: - if (SDNode *I = SelectBitfieldInsertOp(Node)) - return I; + if (tryBitfieldInsertOp(Node)) + return; break; case ISD::EXTRACT_VECTOR_ELT: { @@ -2401,19 +2659,25 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { DEBUG(dbgs() << "ISEL: Custom selection!\n=> "); DEBUG(Extract->dumpr(CurDAG)); DEBUG(dbgs() << "\n"); - return Extract.getNode(); + ReplaceNode(Node, Extract.getNode()); + return; } case ISD::Constant: { // Materialize zero constants as copies from WZR/XZR. This allows // the coalescer to propagate these into other instructions. ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node); if (ConstNode->isNullValue()) { - if (VT == MVT::i32) - return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node), - AArch64::WZR, MVT::i32).getNode(); - else if (VT == MVT::i64) - return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node), - AArch64::XZR, MVT::i64).getNode(); + if (VT == MVT::i32) { + SDValue New = CurDAG->getCopyFromReg( + CurDAG->getEntryNode(), SDLoc(Node), AArch64::WZR, MVT::i32); + ReplaceNode(Node, New.getNode()); + return; + } else if (VT == MVT::i64) { + SDValue New = CurDAG->getCopyFromReg( + CurDAG->getEntryNode(), SDLoc(Node), AArch64::XZR, MVT::i64); + ReplaceNode(Node, New.getNode()); + return; + } } break; } @@ -2428,7 +2692,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { SDLoc DL(Node); SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, DL, MVT::i32), CurDAG->getTargetConstant(Shifter, DL, MVT::i32) }; - return CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops); + CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops); + return; } case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue(); @@ -2450,7 +2715,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand(); cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1); - return Ld; + ReplaceNode(Node, Ld); + return; } case Intrinsic::aarch64_stlxp: case Intrinsic::aarch64_stxp: { @@ -2471,208 +2737,305 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand(); cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1); - return St; + ReplaceNode(Node, St); + return; } case Intrinsic::aarch64_neon_ld1x2: - if (VT == MVT::v8i8) - return SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 2, AArch64::LD1Twov4s, 
AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld1x3: - if (VT == MVT::v8i8) - return SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld1x4: - if (VT == MVT::v8i8) - return SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0); - else if (VT == MVT::v2i32 
|| VT == MVT::v2f32) - return SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld2: - if (VT == MVT::v8i8) - return SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld3: - if (VT == MVT::v8i8) - return SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 3, 
AArch64::LD3Threev4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld4: - if (VT == MVT::v8i8) - return SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld2r: - if (VT == MVT::v8i8) - return SelectLoad(Node, 2, AArch64::LD2Rv8b, 
AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld3r: - if (VT == MVT::v8i8) - return SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0); + return; + } break; case 
Intrinsic::aarch64_neon_ld4r: - if (VT == MVT::v8i8) - return SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld2lane: - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectLoadLane(Node, 2, AArch64::LD2i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectLoadLane(Node, 2, AArch64::LD2i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectLoadLane(Node, 2, AArch64::LD2i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectLoadLane(Node, 2, AArch64::LD2i64); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectLoadLane(Node, 2, AArch64::LD2i8); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectLoadLane(Node, 2, AArch64::LD2i16); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectLoadLane(Node, 2, AArch64::LD2i32); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectLoadLane(Node, 2, AArch64::LD2i64); + return; + } break; case Intrinsic::aarch64_neon_ld3lane: - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectLoadLane(Node, 3, AArch64::LD3i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectLoadLane(Node, 3, AArch64::LD3i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectLoadLane(Node, 3, AArch64::LD3i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectLoadLane(Node, 3, AArch64::LD3i64); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectLoadLane(Node, 3, 
AArch64::LD3i8); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectLoadLane(Node, 3, AArch64::LD3i16); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectLoadLane(Node, 3, AArch64::LD3i32); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectLoadLane(Node, 3, AArch64::LD3i64); + return; + } break; case Intrinsic::aarch64_neon_ld4lane: - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectLoadLane(Node, 4, AArch64::LD4i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectLoadLane(Node, 4, AArch64::LD4i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectLoadLane(Node, 4, AArch64::LD4i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectLoadLane(Node, 4, AArch64::LD4i64); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectLoadLane(Node, 4, AArch64::LD4i8); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectLoadLane(Node, 4, AArch64::LD4i16); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectLoadLane(Node, 4, AArch64::LD4i32); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectLoadLane(Node, 4, AArch64::LD4i64); + return; + } break; } } break; @@ -2682,33 +3045,39 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { default: break; case Intrinsic::aarch64_neon_tbl2: - return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBLv8i8Two - : AArch64::TBLv16i8Two, - false); + SelectTable(Node, 2, + VT == MVT::v8i8 ? AArch64::TBLv8i8Two : AArch64::TBLv16i8Two, + false); + return; case Intrinsic::aarch64_neon_tbl3: - return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three - : AArch64::TBLv16i8Three, - false); + SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three + : AArch64::TBLv16i8Three, + false); + return; case Intrinsic::aarch64_neon_tbl4: - return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four - : AArch64::TBLv16i8Four, - false); + SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four + : AArch64::TBLv16i8Four, + false); + return; case Intrinsic::aarch64_neon_tbx2: - return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBXv8i8Two - : AArch64::TBXv16i8Two, - true); + SelectTable(Node, 2, + VT == MVT::v8i8 ? AArch64::TBXv8i8Two : AArch64::TBXv16i8Two, + true); + return; case Intrinsic::aarch64_neon_tbx3: - return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three - : AArch64::TBXv16i8Three, - true); + SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three + : AArch64::TBXv16i8Three, + true); + return; case Intrinsic::aarch64_neon_tbx4: - return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBXv8i8Four - : AArch64::TBXv16i8Four, - true); + SelectTable(Node, 4, VT == MVT::v8i8 ? 
AArch64::TBXv8i8Four + : AArch64::TBXv16i8Four, + true); + return; case Intrinsic::aarch64_neon_smull: case Intrinsic::aarch64_neon_umull: - if (SDNode *N = SelectMULLV64LaneV128(IntNo, Node)) - return N; + if (tryMULLV64LaneV128(IntNo, Node)) + return; break; } break; @@ -2721,588 +3090,827 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { default: break; case Intrinsic::aarch64_neon_st1x2: { - if (VT == MVT::v8i8) - return SelectStore(Node, 2, AArch64::ST1Twov8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 2, AArch64::ST1Twov16b); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectStore(Node, 2, AArch64::ST1Twov4h); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectStore(Node, 2, AArch64::ST1Twov8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 2, AArch64::ST1Twov2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 2, AArch64::ST1Twov4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 2, AArch64::ST1Twov2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 2, AArch64::ST1Twov1d); + if (VT == MVT::v8i8) { + SelectStore(Node, 2, AArch64::ST1Twov8b); + return; + } else if (VT == MVT::v16i8) { + SelectStore(Node, 2, AArch64::ST1Twov16b); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectStore(Node, 2, AArch64::ST1Twov4h); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectStore(Node, 2, AArch64::ST1Twov8h); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectStore(Node, 2, AArch64::ST1Twov2s); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectStore(Node, 2, AArch64::ST1Twov4s); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectStore(Node, 2, AArch64::ST1Twov2d); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectStore(Node, 2, AArch64::ST1Twov1d); + return; + } break; } case Intrinsic::aarch64_neon_st1x3: { - if (VT == MVT::v8i8) - return SelectStore(Node, 3, AArch64::ST1Threev8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 3, AArch64::ST1Threev16b); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectStore(Node, 3, AArch64::ST1Threev4h); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectStore(Node, 3, AArch64::ST1Threev8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 3, AArch64::ST1Threev2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 3, AArch64::ST1Threev4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 3, AArch64::ST1Threev2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 3, AArch64::ST1Threev1d); + if (VT == MVT::v8i8) { + SelectStore(Node, 3, AArch64::ST1Threev8b); + return; + } else if (VT == MVT::v16i8) { + SelectStore(Node, 3, AArch64::ST1Threev16b); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectStore(Node, 3, AArch64::ST1Threev4h); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectStore(Node, 3, AArch64::ST1Threev8h); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectStore(Node, 3, AArch64::ST1Threev2s); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectStore(Node, 3, AArch64::ST1Threev4s); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectStore(Node, 3, AArch64::ST1Threev2d); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) 
{ + SelectStore(Node, 3, AArch64::ST1Threev1d); + return; + } break; } case Intrinsic::aarch64_neon_st1x4: { - if (VT == MVT::v8i8) - return SelectStore(Node, 4, AArch64::ST1Fourv8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 4, AArch64::ST1Fourv16b); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectStore(Node, 4, AArch64::ST1Fourv4h); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectStore(Node, 4, AArch64::ST1Fourv8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 4, AArch64::ST1Fourv2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 4, AArch64::ST1Fourv4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 4, AArch64::ST1Fourv2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 4, AArch64::ST1Fourv1d); + if (VT == MVT::v8i8) { + SelectStore(Node, 4, AArch64::ST1Fourv8b); + return; + } else if (VT == MVT::v16i8) { + SelectStore(Node, 4, AArch64::ST1Fourv16b); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectStore(Node, 4, AArch64::ST1Fourv4h); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectStore(Node, 4, AArch64::ST1Fourv8h); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectStore(Node, 4, AArch64::ST1Fourv2s); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectStore(Node, 4, AArch64::ST1Fourv4s); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectStore(Node, 4, AArch64::ST1Fourv2d); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectStore(Node, 4, AArch64::ST1Fourv1d); + return; + } break; } case Intrinsic::aarch64_neon_st2: { - if (VT == MVT::v8i8) - return SelectStore(Node, 2, AArch64::ST2Twov8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 2, AArch64::ST2Twov16b); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectStore(Node, 2, AArch64::ST2Twov4h); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectStore(Node, 2, AArch64::ST2Twov8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 2, AArch64::ST2Twov2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 2, AArch64::ST2Twov4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 2, AArch64::ST2Twov2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 2, AArch64::ST1Twov1d); + if (VT == MVT::v8i8) { + SelectStore(Node, 2, AArch64::ST2Twov8b); + return; + } else if (VT == MVT::v16i8) { + SelectStore(Node, 2, AArch64::ST2Twov16b); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectStore(Node, 2, AArch64::ST2Twov4h); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectStore(Node, 2, AArch64::ST2Twov8h); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectStore(Node, 2, AArch64::ST2Twov2s); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectStore(Node, 2, AArch64::ST2Twov4s); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectStore(Node, 2, AArch64::ST2Twov2d); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectStore(Node, 2, AArch64::ST1Twov1d); + return; + } break; } case Intrinsic::aarch64_neon_st3: { - if (VT == MVT::v8i8) - return SelectStore(Node, 3, AArch64::ST3Threev8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 3, AArch64::ST3Threev16b); - else if (VT == MVT::v4i16 || VT == 
MVT::v4f16) - return SelectStore(Node, 3, AArch64::ST3Threev4h); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectStore(Node, 3, AArch64::ST3Threev8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 3, AArch64::ST3Threev2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 3, AArch64::ST3Threev4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 3, AArch64::ST3Threev2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 3, AArch64::ST1Threev1d); + if (VT == MVT::v8i8) { + SelectStore(Node, 3, AArch64::ST3Threev8b); + return; + } else if (VT == MVT::v16i8) { + SelectStore(Node, 3, AArch64::ST3Threev16b); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectStore(Node, 3, AArch64::ST3Threev4h); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectStore(Node, 3, AArch64::ST3Threev8h); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectStore(Node, 3, AArch64::ST3Threev2s); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectStore(Node, 3, AArch64::ST3Threev4s); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectStore(Node, 3, AArch64::ST3Threev2d); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectStore(Node, 3, AArch64::ST1Threev1d); + return; + } break; } case Intrinsic::aarch64_neon_st4: { - if (VT == MVT::v8i8) - return SelectStore(Node, 4, AArch64::ST4Fourv8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 4, AArch64::ST4Fourv16b); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectStore(Node, 4, AArch64::ST4Fourv4h); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectStore(Node, 4, AArch64::ST4Fourv8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 4, AArch64::ST4Fourv2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 4, AArch64::ST4Fourv4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 4, AArch64::ST4Fourv2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 4, AArch64::ST1Fourv1d); + if (VT == MVT::v8i8) { + SelectStore(Node, 4, AArch64::ST4Fourv8b); + return; + } else if (VT == MVT::v16i8) { + SelectStore(Node, 4, AArch64::ST4Fourv16b); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectStore(Node, 4, AArch64::ST4Fourv4h); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectStore(Node, 4, AArch64::ST4Fourv8h); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectStore(Node, 4, AArch64::ST4Fourv2s); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectStore(Node, 4, AArch64::ST4Fourv4s); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectStore(Node, 4, AArch64::ST4Fourv2d); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectStore(Node, 4, AArch64::ST1Fourv1d); + return; + } break; } case Intrinsic::aarch64_neon_st2lane: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectStoreLane(Node, 2, AArch64::ST2i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectStoreLane(Node, 2, AArch64::ST2i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectStoreLane(Node, 2, AArch64::ST2i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - 
return SelectStoreLane(Node, 2, AArch64::ST2i64); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectStoreLane(Node, 2, AArch64::ST2i8); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectStoreLane(Node, 2, AArch64::ST2i16); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectStoreLane(Node, 2, AArch64::ST2i32); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectStoreLane(Node, 2, AArch64::ST2i64); + return; + } break; } case Intrinsic::aarch64_neon_st3lane: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectStoreLane(Node, 3, AArch64::ST3i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectStoreLane(Node, 3, AArch64::ST3i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectStoreLane(Node, 3, AArch64::ST3i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectStoreLane(Node, 3, AArch64::ST3i64); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectStoreLane(Node, 3, AArch64::ST3i8); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectStoreLane(Node, 3, AArch64::ST3i16); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectStoreLane(Node, 3, AArch64::ST3i32); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectStoreLane(Node, 3, AArch64::ST3i64); + return; + } break; } case Intrinsic::aarch64_neon_st4lane: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectStoreLane(Node, 4, AArch64::ST4i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectStoreLane(Node, 4, AArch64::ST4i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectStoreLane(Node, 4, AArch64::ST4i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectStoreLane(Node, 4, AArch64::ST4i64); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectStoreLane(Node, 4, AArch64::ST4i8); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectStoreLane(Node, 4, AArch64::ST4i16); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectStoreLane(Node, 4, AArch64::ST4i32); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectStoreLane(Node, 4, AArch64::ST4i64); + return; + } break; } } break; } case AArch64ISD::LD2post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 
2, AArch64::LD2Twov4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD3post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD4post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, 
AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD1x2post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == 
MVT::v1f64) { + SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD1x3post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD1x4post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 4, 
AArch64::LD1Fourv4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD1DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD2DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 
|| VT == MVT::v2f64) - return SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD3DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD4DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 
|| VT == MVT::v2f32) - return SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD1LANEpost: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST); + return; + } break; } case AArch64ISD::LD2LANEpost: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == 
MVT::v4f32 || + VT == MVT::v2f32) { + SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST); + return; + } break; } case AArch64ISD::LD3LANEpost: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST); + return; + } break; } case AArch64ISD::LD4LANEpost: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST); + return; + } break; } case AArch64ISD::ST2post: { VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST); + if (VT == MVT::v8i8) { + 
SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST); + return; + } else if (VT == MVT::v16i8) { + SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST); + return; + } break; } case AArch64ISD::ST3post: { VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST); + if (VT == MVT::v8i8) { + SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST); + return; + } else if (VT == MVT::v16i8) { + SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST); + return; + } break; } case AArch64ISD::ST4post: { VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return 
SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST); + if (VT == MVT::v8i8) { + SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST); + return; + } else if (VT == MVT::v16i8) { + SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST); + return; + } break; } case AArch64ISD::ST1x2post: { VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST); + if (VT == MVT::v8i8) { + SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST); + return; + } else if (VT == MVT::v16i8) { + SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST); + return; + } break; } case AArch64ISD::ST1x3post: { VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST); - 
else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST); + if (VT == MVT::v8i8) { + SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST); + return; + } else if (VT == MVT::v16i8) { + SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST); + return; + } break; } case AArch64ISD::ST1x4post: { VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST); + if (VT == MVT::v8i8) { + SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST); + return; + } else if (VT == MVT::v16i8) { + SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST); + return; + } break; } case AArch64ISD::ST2LANEpost: { VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectPostStoreLane(Node, 2, 
AArch64::ST2i8_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST); + return; + } break; } case AArch64ISD::ST3LANEpost: { VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST); + return; + } break; } case AArch64ISD::ST4LANEpost: { VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST); + return; + } break; } } // Select the default instruction - ResNode = SelectCode(Node); - - DEBUG(errs() << "=> "); - if (ResNode == nullptr || ResNode == Node) - DEBUG(Node->dump(CurDAG)); - else - DEBUG(ResNode->dump(CurDAG)); - DEBUG(errs() << "\n"); - - return ResNode; + SelectCode(Node); } /// createAArch64ISelDag - This pass converts a legalized DAG into a diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 92cf1cd71970b..d6f2a190d4c85 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ 
b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -40,12 +40,6 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumShiftInserts, "Number of vector shift inserts"); -// Place holder until extr generation is tested fully. -static cl::opt<bool> -EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden, - cl::desc("Allow AArch64 (or (shift)(shift))->extract"), - cl::init(true)); - static cl::opt<bool> EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, cl::desc("Allow AArch64 SLI/SRI formation"), @@ -59,6 +53,13 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false)); +// Disabled for causing self-hosting failures once returned-attribute inference +// was enabled. +static cl::opt<bool> +EnableThisRetForwarding("aarch64-this-return-forwarding", cl::Hidden, + cl::desc("Directly forward this return"), + cl::init(false)); + /// Value type used for condition codes. static const MVT MVT_CC = MVT::i32; @@ -225,13 +226,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); - // Expand the undefined-at-zero variants to cttz/ctlz to their defined-at-zero - // counterparts, which AArch64 supports directly. - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); - setOperationAction(ISD::CTPOP, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i64, Custom); @@ -402,6 +396,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); + // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. // This requires the Performance Monitors extension. if (Subtarget->hasPerfMon()) @@ -476,7 +472,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Also, try to fold ADD into CSINC/CSINV.. setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::SUB); - + setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::XOR); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); @@ -518,7 +514,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, MaskAndBranchFoldingIsLegal = true; EnableExtLdPromotion = true; + // Set required alignment. setMinFunctionAlignment(2); + // Set preferred alignments. 
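Before the preferred alignments are applied below, a quick standalone illustration may help: hooks like setMinFunctionAlignment(2) in this vintage of the API take a log2-encoded alignment, so the 2 means 4 bytes (one AArch64 instruction). The helper names here are illustrative, not LLVM's.

#include <cassert>
#include <cstdint>

// Hypothetical helpers: convert a log2-encoded alignment (the convention
// setMinFunctionAlignment/setPrefFunctionAlignment are assumed to use here)
// to bytes, and round an address up to it.
static uint64_t alignmentInBytes(unsigned Log2Align) {
  return uint64_t(1) << Log2Align;
}

static uint64_t alignAddressUp(uint64_t Addr, unsigned Log2Align) {
  uint64_t A = alignmentInBytes(Log2Align);
  return (Addr + A - 1) & ~(A - 1);
}

int main() {
  assert(alignmentInBytes(2) == 4);            // setMinFunctionAlignment(2) => 4 bytes
  assert(alignAddressUp(0x1001, 2) == 0x1004); // round up to the next instruction boundary
  return 0;
}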
+ setPrefFunctionAlignment(STI.getPrefFunctionAlignment()); + setPrefLoopAlignment(STI.getPrefLoopAlignment()); setHasExtractBitsInsn(true); @@ -583,6 +583,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); + setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); + setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); + + setOperationAction(ISD::CTTZ, MVT::v2i8, Expand); + setOperationAction(ISD::CTTZ, MVT::v4i16, Expand); + setOperationAction(ISD::CTTZ, MVT::v2i32, Expand); + setOperationAction(ISD::CTTZ, MVT::v1i64, Expand); + setOperationAction(ISD::CTTZ, MVT::v16i8, Expand); + setOperationAction(ISD::CTTZ, MVT::v8i16, Expand); + setOperationAction(ISD::CTTZ, MVT::v4i32, Expand); + setOperationAction(ISD::CTTZ, MVT::v2i64, Expand); + // AArch64 doesn't have MUL.2d: setOperationAction(ISD::MUL, MVT::v2i64, Expand); // Custom handling for some quad-vector types to detect MULL. @@ -623,91 +635,88 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } } - // Prefer likely predicted branches to selects on out-of-order cores. - if (Subtarget->isCortexA57()) - PredictableSelectIsExpensive = true; + PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); } -void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { +void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { if (VT == MVT::v2f32 || VT == MVT::v4f16) { - setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32); + setOperationAction(ISD::LOAD, VT, Promote); + AddPromotedToType(ISD::LOAD, VT, MVT::v2i32); - setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32); + setOperationAction(ISD::STORE, VT, Promote); + AddPromotedToType(ISD::STORE, VT, MVT::v2i32); } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) { - setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64); + setOperationAction(ISD::LOAD, VT, Promote); + AddPromotedToType(ISD::LOAD, VT, MVT::v2i64); - setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64); + setOperationAction(ISD::STORE, VT, Promote); + AddPromotedToType(ISD::STORE, VT, MVT::v2i64); } // Mark vector float intrinsics as expand. 
if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { - setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FPOWI, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); + setOperationAction(ISD::FLOG, VT, Expand); + setOperationAction(ISD::FLOG2, VT, Expand); + setOperationAction(ISD::FLOG10, VT, Expand); + setOperationAction(ISD::FEXP, VT, Expand); + setOperationAction(ISD::FEXP2, VT, Expand); // But we do support custom-lowering for FCOPYSIGN. - setOperationAction(ISD::FCOPYSIGN, VT.getSimpleVT(), Custom); - } - - setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom); - setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom); - setOperationAction(ISD::AND, VT.getSimpleVT(), Custom); - setOperationAction(ISD::OR, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom); - setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); - - setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); - setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); - setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FCOPYSIGN, VT, Custom); + } + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::AND, VT, Custom); + setOperationAction(ISD::OR, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); + + setOperationAction(ISD::SELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); for (MVT InnerVT : MVT::all_valuetypes()) - setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand); + setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); // CNT supports only B element sizes. 
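CNT only counts bits within bytes, so a population count over wider lanes has to be assembled from byte counts (the scalar CTPOP lowering on AArch64 does essentially this with CNT plus a horizontal add). A tiny scalar sketch of the idea, not the actual DAG lowering:

#include <cassert>
#include <cstdint>

// Byte-wise population count -- the only element size NEON CNT supports.
static unsigned popcount8(uint8_t B) {
  unsigned N = 0;
  while (B) {
    N += B & 1;
    B >>= 1;
  }
  return N;
}

// Popcount of a 16-bit lane built from two byte popcounts, mirroring the
// CNT-then-pairwise-add style of widening.
static unsigned popcount16(uint16_t H) {
  return popcount8(uint8_t(H)) + popcount8(uint8_t(H >> 8));
}

int main() {
  assert(popcount16(0x0000) == 0);
  assert(popcount16(0xFFFF) == 16);
  assert(popcount16(0x0F01) == 5);
  return 0;
}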
if (VT != MVT::v8i8 && VT != MVT::v16i8) - setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand); + setOperationAction(ISD::CTPOP, VT, Expand); - setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand); - setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand); - setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand); - setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::FREM, VT, Expand); - setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom); - setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom); + setOperationAction(ISD::FP_TO_SINT, VT, Custom); + setOperationAction(ISD::FP_TO_UINT, VT, Custom); // [SU][MIN|MAX] are available for all NEON types apart from i64. - if (!VT.isFloatingPoint() && - VT.getSimpleVT() != MVT::v2i64 && VT.getSimpleVT() != MVT::v1i64) + if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) - setOperationAction(Opcode, VT.getSimpleVT(), Legal); + setOperationAction(Opcode, VT, Legal); // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!). if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16) for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN, ISD::FMINNUM, ISD::FMAXNUM}) - setOperationAction(Opcode, VT.getSimpleVT(), Legal); + setOperationAction(Opcode, VT, Legal); if (Subtarget->isLittleEndian()) { for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { - setIndexedLoadAction(im, VT.getSimpleVT(), Legal); - setIndexedStoreAction(im, VT.getSimpleVT(), Legal); + setIndexedLoadAction(im, VT, Legal); + setIndexedStoreAction(im, VT, Legal); } } } @@ -804,12 +813,9 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, if (Subtarget->requiresStrictAlign()) return false; - // FIXME: This is mostly true for Cyclone, but not necessarily others. if (Fast) { - // FIXME: Define an attribute for slow unaligned accesses instead of - // relying on the CPU type as a proxy. - // On Cyclone, unaligned 128-bit stores are slow. - *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 || + // Some CPUs are fine with unaligned stores except for 128-bit ones. + *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 || // See comments in performSTORECombine() for more details about // these conditions. 
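A minimal sketch of the predicate computed for *Fast above, with the subtarget query stubbed out as a plain flag and the extra performSTORECombine conditions ignored (names are illustrative, not the real API):

#include <cassert>

// Stand-in for the subtarget feature flag.
struct FakeSubtarget {
  bool Misaligned128StoreIsSlow;
};

// Unaligned accesses are reported fast unless this is a 128-bit store on a
// CPU that has declared misaligned 128-bit stores slow.
static bool isUnalignedAccessFast(const FakeSubtarget &ST,
                                  unsigned StoreSizeInBytes) {
  return !ST.Misaligned128StoreIsSlow || StoreSizeInBytes != 16;
}

int main() {
  FakeSubtarget Slow128{true}, Fast128{false};
  assert(isUnalignedAccessFast(Fast128, 16));  // no penalty declared
  assert(!isUnalignedAccessFast(Slow128, 16)); // 128-bit store flagged slow
  assert(isUnalignedAccessFast(Slow128, 8));   // smaller stores unaffected
  return 0;
}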
@@ -954,12 +960,14 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost"; case AArch64ISD::SMULL: return "AArch64ISD::SMULL"; case AArch64ISD::UMULL: return "AArch64ISD::UMULL"; + case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE"; + case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE"; } return nullptr; } MachineBasicBlock * -AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, +AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *MBB) const { // We materialise the F128CSEL pseudo-instruction as some control flow and a // phi node: @@ -976,14 +984,14 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - DebugLoc DL = MI->getDebugLoc(); + DebugLoc DL = MI.getDebugLoc(); MachineFunction::iterator It = ++MBB->getIterator(); - unsigned DestReg = MI->getOperand(0).getReg(); - unsigned IfTrueReg = MI->getOperand(1).getReg(); - unsigned IfFalseReg = MI->getOperand(2).getReg(); - unsigned CondCode = MI->getOperand(3).getImm(); - bool NZCVKilled = MI->getOperand(4).isKill(); + unsigned DestReg = MI.getOperand(0).getReg(); + unsigned IfTrueReg = MI.getOperand(1).getReg(); + unsigned IfFalseReg = MI.getOperand(2).getReg(); + unsigned CondCode = MI.getOperand(3).getImm(); + bool NZCVKilled = MI.getOperand(4).isKill(); MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); @@ -1014,17 +1022,16 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, .addReg(IfFalseReg) .addMBB(MBB); - MI->eraseFromParent(); + MI.eraseFromParent(); return EndBB; } -MachineBasicBlock * -AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *BB) const { - switch (MI->getOpcode()) { +MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( + MachineInstr &MI, MachineBasicBlock *BB) const { + switch (MI.getOpcode()) { default: #ifndef NDEBUG - MI->dump(); + MI.dump(); #endif llvm_unreachable("Unexpected instruction for custom inserter!"); @@ -1135,6 +1142,35 @@ static void changeFPCCToAArch64CC(ISD::CondCode CC, } } +/// Convert a DAG fp condition code to an AArch64 CC. +/// This differs from changeFPCCToAArch64CC in that it returns cond codes that +/// should be AND'ed instead of OR'ed. +static void changeFPCCToANDAArch64CC(ISD::CondCode CC, + AArch64CC::CondCode &CondCode, + AArch64CC::CondCode &CondCode2) { + CondCode2 = AArch64CC::AL; + switch (CC) { + default: + changeFPCCToAArch64CC(CC, CondCode, CondCode2); + assert(CondCode2 == AArch64CC::AL); + break; + case ISD::SETONE: + // (a one b) + // == ((a olt b) || (a ogt b)) + // == ((a ord b) && (a une b)) + CondCode = AArch64CC::VC; + CondCode2 = AArch64CC::NE; + break; + case ISD::SETUEQ: + // (a ueq b) + // == ((a uno b) || (a oeq b)) + // == ((a ule b) && (a uge b)) + CondCode = AArch64CC::PL; + CondCode2 = AArch64CC::LE; + break; + } +} + /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 /// CC usable with the vector instructions. 
Fewer operations are available /// without a real NZCV register, so we have to use less efficient combinations @@ -1174,11 +1210,18 @@ static bool isLegalArithImmed(uint64_t C) { } static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDLoc dl, SelectionDAG &DAG) { + const SDLoc &dl, SelectionDAG &DAG) { EVT VT = LHS.getValueType(); - if (VT.isFloatingPoint()) + if (VT.isFloatingPoint()) { + assert(VT != MVT::f128); + if (VT == MVT::f16) { + LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); + RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); + VT = MVT::f32; + } return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS); + } // The CMP instruction is just an alias for SUBS, and representing it as // SUBS means that it's possible to get CSE with subtract operations. @@ -1258,22 +1301,31 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate. static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, - SDValue Condition, unsigned NZCV, - SDLoc DL, SelectionDAG &DAG) { + AArch64CC::CondCode Predicate, + AArch64CC::CondCode OutCC, + const SDLoc &DL, SelectionDAG &DAG) { unsigned Opcode = 0; - if (LHS.getValueType().isFloatingPoint()) + if (LHS.getValueType().isFloatingPoint()) { + assert(LHS.getValueType() != MVT::f128); + if (LHS.getValueType() == MVT::f16) { + LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS); + RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS); + } Opcode = AArch64ISD::FCCMP; - else if (RHS.getOpcode() == ISD::SUB) { + } else if (RHS.getOpcode() == ISD::SUB) { SDValue SubOp0 = RHS.getOperand(0); if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { - // See emitComparison() on why we can only do this for SETEQ and SETNE. - Opcode = AArch64ISD::CCMN; - RHS = RHS.getOperand(1); - } + // See emitComparison() on why we can only do this for SETEQ and SETNE. + Opcode = AArch64ISD::CCMN; + RHS = RHS.getOperand(1); + } } if (Opcode == 0) Opcode = AArch64ISD::CCMP; + SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC); + AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); } @@ -1284,31 +1336,49 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, /// at the leafs only. i.e. "not (or (or x y) z)" can be changed to /// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be /// brought into such a form. -static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate, +static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate, unsigned Depth = 0) { if (!Val.hasOneUse()) return false; unsigned Opcode = Val->getOpcode(); if (Opcode == ISD::SETCC) { - CanPushNegate = true; + if (Val->getOperand(0).getValueType() == MVT::f128) + return false; + CanNegate = true; return true; } - // Protect against stack overflow. - if (Depth > 15) + // Protect against exponential runtime and stack overflow. 
+ if (Depth > 6) return false; if (Opcode == ISD::AND || Opcode == ISD::OR) { SDValue O0 = Val->getOperand(0); SDValue O1 = Val->getOperand(1); - bool CanPushNegateL; - if (!isConjunctionDisjunctionTree(O0, CanPushNegateL, Depth+1)) + bool CanNegateL; + if (!isConjunctionDisjunctionTree(O0, CanNegateL, Depth+1)) return false; - bool CanPushNegateR; - if (!isConjunctionDisjunctionTree(O1, CanPushNegateR, Depth+1)) + bool CanNegateR; + if (!isConjunctionDisjunctionTree(O1, CanNegateR, Depth+1)) return false; - // We cannot push a negate through an AND operation (it would become an OR), - // we can however change a (not (or x y)) to (and (not x) (not y)) if we can - // push the negate through the x/y subtrees. - CanPushNegate = (Opcode == ISD::OR) && CanPushNegateL && CanPushNegateR; + + if (Opcode == ISD::OR) { + // For an OR expression we need to be able to negate at least one side or + // we cannot do the transformation at all. + if (!CanNegateL && !CanNegateR) + return false; + // We can however change a (not (or x y)) to (and (not x) (not y)) if we + // can negate the x and y subtrees. + CanNegate = CanNegateL && CanNegateR; + } else { + // If the operands are OR expressions then we finally need to negate their + // outputs, we can only do that for the operand with emitted last by + // negating OutCC, not for both operands. + bool NeedsNegOutL = O0->getOpcode() == ISD::OR; + bool NeedsNegOutR = O1->getOpcode() == ISD::OR; + if (NeedsNegOutL && NeedsNegOutR) + return false; + // We cannot negate an AND operation (it would become an OR), + CanNegate = false; + } return true; } return false; @@ -1324,10 +1394,9 @@ static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate, /// effects pushed to the tree leafs; @p Predicate is an NZCV flag predicate /// for the comparisons in the current subtree; @p Depth limits the search /// depth to avoid stack overflow. -static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, - AArch64CC::CondCode &OutCC, bool PushNegate = false, - SDValue CCOp = SDValue(), AArch64CC::CondCode Predicate = AArch64CC::AL, - unsigned Depth = 0) { +static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val, + AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, + AArch64CC::CondCode Predicate) { // We're at a tree leaf, produce a conditional comparison operation. unsigned Opcode = Val->getOpcode(); if (Opcode == ISD::SETCC) { @@ -1335,7 +1404,7 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, SDValue RHS = Val->getOperand(1); ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get(); bool isInteger = LHS.getValueType().isInteger(); - if (PushNegate) + if (Negate) CC = getSetCCInverse(CC, isInteger); SDLoc DL(Val); // Determine OutCC and handle FP special case. @@ -1344,68 +1413,62 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, } else { assert(LHS.getValueType().isFloatingPoint()); AArch64CC::CondCode ExtraCC; - changeFPCCToAArch64CC(CC, OutCC, ExtraCC); - // Surpisingly some floating point conditions can't be tested with a - // single condition code. Construct an additional comparison in this case. - // See comment below on how we deal with OR conditions. + changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC); + // Some floating point conditions can't be tested with a single condition + // code. Construct an additional comparison in this case. 
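The extra comparison is needed because of identities such as (a one b) == ((a ord b) && (a une b)), which the changeFPCCToANDAArch64CC helper above encodes as two AND'ed condition codes. A quick scalar check of that identity in plain C++, not target code:

#include <cassert>
#include <cmath>
#include <limits>

// one: ordered and not equal.  ord: neither operand is NaN.
// une: unordered or not equal.
static bool one(double A, double B) {
  return !std::isnan(A) && !std::isnan(B) && A != B;
}
static bool ord(double A, double B) {
  return !std::isnan(A) && !std::isnan(B);
}
static bool une(double A, double B) {
  return std::isnan(A) || std::isnan(B) || A != B;
}

int main() {
  const double NaN = std::numeric_limits<double>::quiet_NaN();
  const double Vals[] = {0.0, 1.0, -2.5, NaN};
  // (a one b) == ((a ord b) && (a une b)) for every pair, which is why
  // SETONE can be emitted as an AND of two AArch64 condition codes.
  for (double A : Vals)
    for (double B : Vals)
      assert(one(A, B) == (ord(A, B) && une(A, B)));
  return 0;
}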
if (ExtraCC != AArch64CC::AL) { SDValue ExtraCmp; if (!CCOp.getNode()) ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); - else { - SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC); - // Note that we want the inverse of ExtraCC, so NZCV is not inversed. - unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(ExtraCC); - ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, - NZCV, DL, DAG); - } + else + ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, + ExtraCC, DL, DAG); CCOp = ExtraCmp; - Predicate = AArch64CC::getInvertedCondCode(ExtraCC); - OutCC = AArch64CC::getInvertedCondCode(OutCC); + Predicate = ExtraCC; } } // Produce a normal comparison if we are first in the chain - if (!CCOp.getNode()) + if (!CCOp) return emitComparison(LHS, RHS, CC, DL, DAG); // Otherwise produce a ccmp. - SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC); - AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); - unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); - return emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL, + return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL, DAG); - } else if ((Opcode != ISD::AND && Opcode != ISD::OR) || !Val->hasOneUse()) - return SDValue(); - - assert((Opcode == ISD::OR || !PushNegate) - && "Can only push negate through OR operation"); + } + assert((Opcode == ISD::AND || (Opcode == ISD::OR && Val->hasOneUse())) && + "Valid conjunction/disjunction tree"); // Check if both sides can be transformed. SDValue LHS = Val->getOperand(0); SDValue RHS = Val->getOperand(1); - bool CanPushNegateL; - if (!isConjunctionDisjunctionTree(LHS, CanPushNegateL, Depth+1)) - return SDValue(); - bool CanPushNegateR; - if (!isConjunctionDisjunctionTree(RHS, CanPushNegateR, Depth+1)) - return SDValue(); - // Do we need to negate our operands? - bool NegateOperands = Opcode == ISD::OR; + // In case of an OR we need to negate our operands and the result. + // (A v B) <=> not(not(A) ^ not(B)) + bool NegateOpsAndResult = Opcode == ISD::OR; // We can negate the results of all previous operations by inverting the - // predicate flags giving us a free negation for one side. For the other side - // we need to be able to push the negation to the leafs of the tree. - if (NegateOperands) { - if (!CanPushNegateL && !CanPushNegateR) - return SDValue(); - // Order the side where we can push the negate through to LHS. - if (!CanPushNegateL && CanPushNegateR) + // predicate flags giving us a free negation for one side. The other side + // must be negatable by itself. + if (NegateOpsAndResult) { + // See which side we can negate. + bool CanNegateL; + bool isValidL = isConjunctionDisjunctionTree(LHS, CanNegateL); + assert(isValidL && "Valid conjunction/disjunction tree"); + (void)isValidL; + +#ifndef NDEBUG + bool CanNegateR; + bool isValidR = isConjunctionDisjunctionTree(RHS, CanNegateR); + assert(isValidR && "Valid conjunction/disjunction tree"); + assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree"); +#endif + + // Order the side which we cannot negate to RHS so we can emit it first. 
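The "(A v B) <=> not(not(A) ^ not(B))" comment above is De Morgan's law; it is what lets an OR of comparisons be emitted as a CCMP chain (which naturally expresses AND) with the final flags inverted. A four-case exhaustive check:

#include <cassert>

int main() {
  // De Morgan: A v B  <=>  not(not(A) ^ not(B)).
  for (int A = 0; A <= 1; ++A)
    for (int B = 0; B <= 1; ++B)
      assert((A || B) == !(!A && !B));
  return 0;
}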
+ if (!CanNegateL) std::swap(LHS, RHS); } else { bool NeedsNegOutL = LHS->getOpcode() == ISD::OR; - bool NeedsNegOutR = RHS->getOpcode() == ISD::OR; - if (NeedsNegOutL && NeedsNegOutR) - return SDValue(); + assert((!NeedsNegOutL || RHS->getOpcode() != ISD::OR) && + "Valid conjunction/disjunction tree"); // Order the side where we need to negate the output flags to RHS so it // gets emitted first. if (NeedsNegOutL) @@ -1416,24 +1479,39 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, // through if we are already in a PushNegate case, otherwise we can negate // the "flags to test" afterwards. AArch64CC::CondCode RHSCC; - SDValue CmpR = emitConjunctionDisjunctionTree(DAG, RHS, RHSCC, PushNegate, - CCOp, Predicate, Depth+1); - if (NegateOperands && !PushNegate) + SDValue CmpR = emitConjunctionDisjunctionTreeRec(DAG, RHS, RHSCC, Negate, + CCOp, Predicate); + if (NegateOpsAndResult && !Negate) RHSCC = AArch64CC::getInvertedCondCode(RHSCC); - // Emit LHS. We must push the negate through if we need to negate it. - SDValue CmpL = emitConjunctionDisjunctionTree(DAG, LHS, OutCC, NegateOperands, - CmpR, RHSCC, Depth+1); + // Emit LHS. We may need to negate it. + SDValue CmpL = emitConjunctionDisjunctionTreeRec(DAG, LHS, OutCC, + NegateOpsAndResult, CmpR, + RHSCC); // If we transformed an OR to and AND then we have to negate the result - // (or absorb a PushNegate resulting in a double negation). - if (Opcode == ISD::OR && !PushNegate) + // (or absorb the Negate parameter). + if (NegateOpsAndResult && !Negate) OutCC = AArch64CC::getInvertedCondCode(OutCC); return CmpL; } +/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain +/// of CCMP/CFCMP ops. See @ref AArch64CCMP. +/// \see emitConjunctionDisjunctionTreeRec(). +static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, + AArch64CC::CondCode &OutCC) { + bool CanNegate; + if (!isConjunctionDisjunctionTree(Val, CanNegate)) + return SDValue(); + + return emitConjunctionDisjunctionTreeRec(DAG, Val, OutCC, false, SDValue(), + AArch64CC::AL); +} + /// @} static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) { + SDValue &AArch64cc, SelectionDAG &DAG, + const SDLoc &dl) { if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { EVT VT = RHS.getValueType(); uint64_t C = RHSC->getZExtValue(); @@ -1994,7 +2072,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0); + .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args)); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); return CallResult.first; @@ -2096,8 +2174,7 @@ static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { // The values are implicitly truncated so sext vs. zext doesn't matter. Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, - MVT::getVectorVT(TruncVT, NumElts), Ops); + return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); } static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { @@ -2213,7 +2290,7 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDLoc dl(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. 
- case Intrinsic::aarch64_thread_pointer: { + case Intrinsic::thread_pointer: { EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); } @@ -2356,6 +2433,8 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, return CC_AArch64_GHC; case CallingConv::C: case CallingConv::Fast: + case CallingConv::PreserveMost: + case CallingConv::CXX_FAST_TLS: if (!Subtarget->isTargetDarwin()) return CC_AArch64_AAPCS; return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; @@ -2364,8 +2443,8 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, SDValue AArch64TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const { + const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -2515,13 +2594,14 @@ SDValue AArch64TargetLowering::LowerFormalArguments( ArgValue = DAG.getExtLoad( ExtType, DL, VA.getLocVT(), Chain, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), - MemVT, false, false, false, 0); + MemVT); InVals.push_back(ArgValue); } } // varargs + AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); if (isVarArg) { if (!Subtarget->isTargetDarwin()) { // The AAPCS variadic function ABI is identical to the non-variadic @@ -2530,22 +2610,20 @@ SDValue AArch64TargetLowering::LowerFormalArguments( saveVarArgRegisters(CCInfo, DAG, DL, Chain); } - AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); // This will point to the next argument passed via stack. unsigned StackOffset = CCInfo.getNextStackOffset(); // We currently pass all varargs at 8-byte alignment. StackOffset = ((StackOffset + 7) & ~7); - AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); + FuncInfo->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); } - AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); unsigned StackArgSize = CCInfo.getNextStackOffset(); bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { // This is a non-standard ABI so by fiat I say we're allowed to make full // use of the stack area to be popped, which must be aligned to 16 bytes in // any case: - StackArgSize = RoundUpToAlignment(StackArgSize, 16); + StackArgSize = alignTo(StackArgSize, 16); // If we're expected to restore the stack (e.g. fastcc) then we'll be adding // a multiple of 16. 
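alignTo here (like the alignTo(NumBytes, 16) calls later in LowerCall) simply rounds a byte count up to the next multiple of 16; the real function lives in llvm/Support/MathExtras.h, but the rounding itself is just:

#include <cassert>
#include <cstdint>

// Round Value up to the next multiple of Align (any Align > 0).
static uint64_t alignToSketch(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  assert(alignToSketch(0, 16) == 0);
  assert(alignToSketch(1, 16) == 16);
  assert(alignToSketch(16, 16) == 16);
  assert(alignToSketch(40, 16) == 48); // a 40-byte stack arg area pads to 48
  return 0;
}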
@@ -2563,7 +2641,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( } void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, - SelectionDAG &DAG, SDLoc DL, + SelectionDAG &DAG, + const SDLoc &DL, SDValue &Chain) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -2590,8 +2669,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); SDValue Store = DAG.getStore( Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8), false, - false, 0); + MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8)); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); @@ -2620,8 +2698,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, SDValue Store = DAG.getStore( Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16), - false, false, 0); + MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16)); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(16, DL, PtrVT)); @@ -2640,8 +2717,8 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, /// appropriate copies out of appropriate physical registers. SDValue AArch64TargetLowering::LowerCallResult( SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals, bool isThisReturn, + const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, SDValue ThisVal) const { CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? 
RetCC_AArch64_WebKit_JS @@ -2658,7 +2735,7 @@ SDValue AArch64TargetLowering::LowerCallResult( // Pass 'this' value directly from the argument to return value, to avoid // reg unit interference - if (i == 0 && isThisReturn) { + if (i == 0 && isThisReturn && EnableThisRetForwarding) { assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && "unexpected return calling convention register assignment"); InVals.push_back(ThisVal); @@ -2688,7 +2765,6 @@ SDValue AArch64TargetLowering::LowerCallResult( bool AArch64TargetLowering::isEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, - bool isCalleeStructRet, bool isCallerStructRet, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { @@ -2698,7 +2774,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) return false; - const MachineFunction &MF = DAG.getMachineFunction(); + MachineFunction &MF = DAG.getMachineFunction(); const Function *CallerF = MF.getFunction(); CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; @@ -2713,9 +2789,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( return false; if (getTargetMachine().Options.GuaranteedTailCallOpt) { - if (IsTailCallConvention(CalleeCC) && CCMatch) - return true; - return false; + return IsTailCallConvention(CalleeCC) && CCMatch; } // Externally-defined functions with weak linkage should not be @@ -2742,6 +2816,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( assert((!isVarArg || CalleeCC == CallingConv::C) && "Unexpected variadic calling convention"); + LLVMContext &C = *DAG.getContext(); if (isVarArg && !Outs.empty()) { // At least two cases here: if caller is fastcc then we can't have any // memory arguments (we'd be expected to clean up the stack afterwards). If @@ -2750,8 +2825,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( // FIXME: for now we take the most conservative of these in both cases: // disallow all variadic memory operands. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, - *DAG.getContext()); + CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); for (const CCValAssign &ArgLoc : ArgLocs) @@ -2759,34 +2833,18 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( return false; } - // If the calling conventions do not match, then we'd better make sure the - // results are returned in the same way as what the caller expects. + // Check that the call results are passed in the same way. + if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, + CCAssignFnForCall(CalleeCC, isVarArg), + CCAssignFnForCall(CallerCC, isVarArg))) + return false; + // The callee has to preserve all registers the caller needs to preserve. 
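The callee-saved requirement can be viewed as a mask-subset test: everything the caller's convention preserves must also be preserved by the callee, which is roughly what TRI->regmaskSubsetEqual checks below when the conventions differ. A toy model with a single 64-bit mask (the real register masks are arrays of uint32_t):

#include <cassert>
#include <cstdint>

// Bit i set means "register i is preserved across the call".  A tail call
// is only safe if the callee preserves at least what the caller relies on.
static bool calleePreservesEnough(uint64_t CallerPreserved,
                                  uint64_t CalleePreserved) {
  return (CallerPreserved & ~CalleePreserved) == 0;
}

int main() {
  const uint64_t CallerMask = 0x0FF0; // caller relies on regs 4..11
  assert(calleePreservesEnough(CallerMask, 0xFFFF));  // callee saves more: OK
  assert(!calleePreservesEnough(CallerMask, 0x00F0)); // regs 8..11 clobbered
  return 0;
}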
+ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); if (!CCMatch) { - SmallVector<CCValAssign, 16> RVLocs1; - CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, - *DAG.getContext()); - CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg)); - - SmallVector<CCValAssign, 16> RVLocs2; - CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, - *DAG.getContext()); - CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg)); - - if (RVLocs1.size() != RVLocs2.size()) + const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); + if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) return false; - for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { - if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) - return false; - if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) - return false; - if (RVLocs1[i].isRegLoc()) { - if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) - return false; - } else { - if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) - return false; - } - } } // Nothing more to check if the callee is taking no arguments @@ -2794,16 +2852,22 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( return true; SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, - *DAG.getContext()); + CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); - // If the stack arguments for this call would fit into our own save area then - // the call can be made tail. - return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea(); + // If the stack arguments for this call do not fit into our own save area then + // the call cannot be made tail. + if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) + return false; + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) + return false; + + return true; } SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, @@ -2845,7 +2909,8 @@ bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, } bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { - return CallCC == CallingConv::Fast; + return CallCC == CallingConv::Fast || + CallCC == CallingConv::PreserveMost; } /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, @@ -2865,7 +2930,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool IsVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); - bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); bool IsThisReturn = false; AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); @@ -2875,8 +2939,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (IsTailCall) { // Check if it's really possible to do a tail call. 
IsTailCall = isEligibleForTailCallOptimization( - Callee, CallConv, IsVarArg, IsStructRet, - MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG); + Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); @@ -2959,7 +3022,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Since callee will pop argument stack as a tail call, we must keep the // popped size 16-byte aligned. - NumBytes = RoundUpToAlignment(NumBytes, 16); + NumBytes = alignTo(NumBytes, 16); // FPDiff will be negative if this tail call requires more space than we // would automatically have in our incoming argument space. Positive if we @@ -3092,8 +3155,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, VA.getValVT() == MVT::i16) Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); - SDValue Store = - DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0); + SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo); MemOpChains.push_back(Store); } } @@ -3199,9 +3261,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); InFlag = Chain.getValue(1); - uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) - ? RoundUpToAlignment(NumBytes, 16) - : 0; + uint64_t CalleePopBytes = + DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0; Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), DAG.getIntPtrConstant(CalleePopBytes, DL, true), @@ -3232,7 +3293,7 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, - SDLoc DL, SelectionDAG &DAG) const { + const SDLoc &DL, SelectionDAG &DAG) const { CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? 
RetCC_AArch64_WebKit_JS : RetCC_AArch64_AAPCS; @@ -3318,26 +3379,6 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); } - if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) { - assert(getTargetMachine().getCodeModel() == CodeModel::Small && - "use of MO_CONSTPOOL only supported on small model"); - SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE); - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC; - SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags); - SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); - SDValue GlobalAddr = DAG.getLoad( - PtrVT, DL, DAG.getEntryNode(), PoolAddr, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - /*isVolatile=*/false, - /*isNonTemporal=*/true, - /*isInvariant=*/true, 8); - if (GN->getOffset() != 0) - return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr, - DAG.getConstant(GN->getOffset(), DL, PtrVT)); - return GlobalAddr; - } - if (getTargetMachine().getCodeModel() == CodeModel::Large) { const unsigned char MO_NC = AArch64II::MO_NC; return DAG.getNode( @@ -3405,8 +3446,9 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = DAG.getLoad(MVT::i64, DL, Chain, DescAddr, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, - true, true, 8); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + /* Alignment = */ 8, MachineMemOperand::MONonTemporal | + MachineMemOperand::MOInvariant); Chain = FuncTLVGet.getValue(1); MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); @@ -3447,18 +3489,16 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the /// above sequence, and expanded really late in the compilation flow, to ensure /// the sequence is produced as per above. -SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL, +SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, + const SDLoc &DL, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - SmallVector<SDValue, 2> Ops; - Ops.push_back(Chain); - Ops.push_back(SymAddr); - - Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, Ops); + Chain = + DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr}); SDValue Glue = Chain.getValue(1); return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); @@ -3888,7 +3928,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, - SDValue FVal, SDLoc dl, + SDValue FVal, const SDLoc &dl, SelectionDAG &DAG) const { // Handle f128 first, because it will result in a comparison of some RTLIB // call result against zero. 
@@ -4181,7 +4221,7 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, getPointerTy(DAG.getDataLayout())); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), - MachinePointerInfo(SV), false, false, 0); + MachinePointerInfo(SV)); } SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, @@ -4201,7 +4241,7 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, // void *__stack at offset 0 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, - MachinePointerInfo(SV), false, false, 8)); + MachinePointerInfo(SV), /* Alignment = */ 8)); // void *__gr_top at offset 8 int GPRSize = FuncInfo->getVarArgsGPRSize(); @@ -4216,7 +4256,8 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, DAG.getConstant(GPRSize, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, - MachinePointerInfo(SV, 8), false, false, 8)); + MachinePointerInfo(SV, 8), + /* Alignment = */ 8)); } // void *__vr_top at offset 16 @@ -4231,24 +4272,23 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, DAG.getConstant(FPRSize, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, - MachinePointerInfo(SV, 16), false, false, 8)); + MachinePointerInfo(SV, 16), + /* Alignment = */ 8)); } // int __gr_offs at offset 24 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT)); - MemOps.push_back(DAG.getStore(Chain, DL, - DAG.getConstant(-GPRSize, DL, MVT::i32), - GROffsAddr, MachinePointerInfo(SV, 24), false, - false, 4)); + MemOps.push_back(DAG.getStore( + Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr, + MachinePointerInfo(SV, 24), /* Alignment = */ 4)); // int __vr_offs at offset 28 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT)); - MemOps.push_back(DAG.getStore(Chain, DL, - DAG.getConstant(-FPRSize, DL, MVT::i32), - VROffsAddr, MachinePointerInfo(SV, 28), false, - false, 4)); + MemOps.push_back(DAG.getStore( + Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr, + MachinePointerInfo(SV, 28), /* Alignment = */ 4)); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } @@ -4287,8 +4327,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { unsigned Align = Op.getConstantOperandVal(3); auto PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V), - false, false, false, 0); + SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V)); Chain = VAList.getValue(1); if (Align > 8) { @@ -4318,14 +4357,14 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(ArgSize, DL, PtrVT)); // Store the incremented VAList to the legalized pointer - SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), - false, false, 0); + SDValue APStore = + DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V)); // Load the actual argument out of the pointer VAList if (NeedFPTrunc) { // Load the value as an f64. - SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, - MachinePointerInfo(), false, false, false, 0); + SDValue WideFP = + DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo()); // Round the value down to an f32. 
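The f64 load followed by the FP_ROUND just below reflects the C rule that a variadic float is promoted and passed in a double-sized slot, so the callee loads the wide value and narrows it. A plain C++ picture of the same narrowing, purely illustrative:

#include <cassert>
#include <cstring>

// Read a double-sized va_list slot and round it back down to float --
// the same shape as the f64 load + FP_ROUND emitted here.
static float loadNarrowedFloat(const void *ArgSlot) {
  double Wide;
  std::memcpy(&Wide, ArgSlot, sizeof(Wide)); // load the f64 slot
  return static_cast<float>(Wide);           // narrow to f32
}

int main() {
  double Slot = 2.5; // value as it sits in the variadic argument area
  assert(loadNarrowedFloat(&Slot) == 2.5f);
  return 0;
}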
SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), DAG.getIntPtrConstant(1, DL)); @@ -4334,8 +4373,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues(Ops, DL); } - return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false, - false, false, 0); + return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo()); } SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, @@ -4350,7 +4388,7 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, - MachinePointerInfo(), false, false, false, 0); + MachinePointerInfo()); return FrameAddr; } @@ -4381,7 +4419,7 @@ SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); return DAG.getLoad(VT, DL, DAG.getEntryNode(), DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), - MachinePointerInfo(), false, false, false, 0); + MachinePointerInfo()); } // Return LR, which contains the return address. Mark it an implicit live-in. @@ -4521,6 +4559,40 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { // AArch64 Optimization Hooks //===----------------------------------------------------------------------===// +/// getEstimate - Return the appropriate estimate DAG for either the reciprocal +/// or the reciprocal square root. +static SDValue getEstimate(const AArch64Subtarget &ST, + const AArch64TargetLowering::DAGCombinerInfo &DCI, unsigned Opcode, + const SDValue &Operand, unsigned &ExtraSteps) { + if (!ST.hasNEON()) + return SDValue(); + + EVT VT = Operand.getValueType(); + + std::string RecipOp; + RecipOp = Opcode == (AArch64ISD::FRECPE) ? "div": "sqrt"; + RecipOp = ((VT.isVector()) ? "vec-": "") + RecipOp; + RecipOp += (VT.getScalarType() == MVT::f64) ? "d": "f"; + + TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; + if (!Recips.isEnabled(RecipOp)) + return SDValue(); + + ExtraSteps = Recips.getRefinementSteps(RecipOp); + return DCI.DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); +} + +SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, + DAGCombinerInfo &DCI, unsigned &ExtraSteps) const { + return getEstimate(*Subtarget, DCI, AArch64ISD::FRECPE, Operand, ExtraSteps); +} + +SDValue AArch64TargetLowering::getRsqrtEstimate(SDValue Operand, + DAGCombinerInfo &DCI, unsigned &ExtraSteps, bool &UseOneConst) const { + UseOneConst = true; + return getEstimate(*Subtarget, DCI, AArch64ISD::FRSQRTE, Operand, ExtraSteps); +} + //===----------------------------------------------------------------------===// // AArch64 Inline Assembly Support //===----------------------------------------------------------------------===// @@ -4548,6 +4620,27 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { // is prefixed by the %w modifier. Floating-point and SIMD register operands // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or // %q modifier. +const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const { + // At this point, we have to lower this constraint to something else, so we + // lower it to an "r" or "w". However, by doing this we will force the result + // to be in register, while the X constraint is much more permissive. 
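A condensed standalone restatement of the decision the new LowerXConstraint makes (struct and function names here are illustrative, not LLVM types):

#include <cassert>
#include <string>

// Illustrative model of the operand seen when lowering the "X" constraint.
struct OperandKind {
  bool IsFloatingPoint;
  bool IsVector;
  unsigned SizeInBits;
};

// Map the fully permissive "X" constraint onto "w" (an FP/SIMD register)
// when the operand can live there and FP registers exist, else onto "r".
static std::string lowerXConstraintSketch(bool HasFPARMv8, OperandKind Op) {
  if (!HasFPARMv8)
    return "r";
  if (Op.IsFloatingPoint)
    return "w";
  if (Op.IsVector && (Op.SizeInBits == 64 || Op.SizeInBits == 128))
    return "w";
  return "r";
}

int main() {
  assert(lowerXConstraintSketch(true, {true, false, 64}) == "w");  // f64
  assert(lowerXConstraintSketch(true, {false, true, 128}) == "w"); // v4i32
  assert(lowerXConstraintSketch(true, {false, false, 64}) == "r"); // i64
  assert(lowerXConstraintSketch(false, {true, false, 32}) == "r"); // no FP regs
  return 0;
}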
+ // + // Although we are correct (we are free to emit anything, without + // constraints), we might break use cases that would expect us to be more + // efficient and emit something else. + if (!Subtarget->hasFPARMv8()) + return "r"; + + if (ConstraintVT.isFloatingPoint()) + return "w"; + + if (ConstraintVT.isVector() && + (ConstraintVT.getSizeInBits() == 64 || + ConstraintVT.getSizeInBits() == 128)) + return "w"; + + return "r"; +} /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. @@ -4642,11 +4735,16 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( int RegNo; bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo); if (!Failed && RegNo >= 0 && RegNo <= 31) { - // v0 - v31 are aliases of q0 - q31. + // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size. // By default we'll emit v0-v31 for this unless there's a modifier where // we'll emit the correct register as well. - Res.first = AArch64::FPR128RegClass.getRegister(RegNo); - Res.second = &AArch64::FPR128RegClass; + if (VT != MVT::Other && VT.getSizeInBits() == 64) { + Res.first = AArch64::FPR64RegClass.getRegister(RegNo); + Res.second = &AArch64::FPR64RegClass; + } else { + Res.first = AArch64::FPR128RegClass.getRegister(RegNo); + Res.second = &AArch64::FPR128RegClass; + } } } } @@ -4862,11 +4960,12 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, SmallVector<ShuffleSourceInfo, 2> Sources; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) + if (V.isUndef()) continue; - else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { + else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa<ConstantSDNode>(V.getOperand(1))) { // A shuffle can only come from building a vector from various - // elements of other vectors. + // elements of other vectors, provided their indices are constant. return SDValue(); } @@ -4985,7 +5084,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { SDValue Entry = Op.getOperand(i); - if (Entry.getOpcode() == ISD::UNDEF) + if (Entry.isUndef()) continue; auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); @@ -5018,7 +5117,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, ShuffleOps[i] = Sources[i].ShuffleVec; SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], - ShuffleOps[1], &Mask[0]); + ShuffleOps[1], Mask); return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } @@ -5304,7 +5403,7 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { /// the specified operations to build the shuffle. 
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, - SDLoc dl) { + const SDLoc &dl) { unsigned OpNum = (PFEntry >> 26) & 0x0F; unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); @@ -5433,35 +5532,34 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); SDValue Shuffle; - if (V2.getNode()->getOpcode() == ISD::UNDEF) { + if (V2.getNode()->isUndef()) { if (IndexLen == 8) V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); + DAG.getBuildVector(IndexVT, DL, + makeArrayRef(TBLMask.data(), IndexLen))); } else { if (IndexLen == 8) { V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); + DAG.getBuildVector(IndexVT, DL, + makeArrayRef(TBLMask.data(), IndexLen))); } else { // FIXME: We cannot, for the moment, emit a TBL2 instruction because we // cannot currently represent the register constraints on the input // table registers. // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, - // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - // &TBLMask[0], IndexLen)); + // DAG.getBuildVector(IndexVT, DL, &TBLMask[0], + // IndexLen)); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), - V1Cst, V2Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); + DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst, + V2Cst, DAG.getBuildVector(IndexVT, DL, + makeArrayRef(TBLMask.data(), IndexLen))); } } return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); @@ -5496,8 +5594,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); - if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], - V1.getValueType().getSimpleVT())) { + if (SVN->isSplat()) { int Lane = SVN->getSplatIndex(); // If this is undef splat, generate it via "just" vdup, if possible. 
if (Lane == -1) @@ -5546,8 +5643,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, Imm *= getExtFactor(V1); return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2, DAG.getConstant(Imm, dl, MVT::i32)); - } else if (V2->getOpcode() == ISD::UNDEF && - isSingletonEXTMask(ShuffleMask, VT, Imm)) { + } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) { Imm *= getExtFactor(V1); return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1, DAG.getConstant(Imm, dl, MVT::i32)); @@ -5580,8 +5676,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); } - SDValue Concat = tryFormConcatFromShuffle(Op, DAG); - if (Concat.getNode()) + if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG)) return Concat; bool DstIsLeft; @@ -5853,8 +5948,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SelectionDAG &DAG) const { // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) if (EnableAArch64SlrGeneration) { - SDValue Res = tryLowerToSLI(Op.getNode(), DAG); - if (Res.getNode()) + if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) return Res; } @@ -5972,7 +6066,7 @@ static SDValue NormalizeBuildVector(SDValue Op, } Ops.push_back(Lane); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + return DAG.getBuildVector(VT, dl, Ops); } SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, @@ -6217,7 +6311,7 @@ FailedModImm: SDValue ConstantValue; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) + if (V.isUndef()) continue; if (i > 0) isOnlyLowElement = false; @@ -6273,7 +6367,7 @@ FailedModImm: for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); - SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); + SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); Val = LowerBUILD_VECTOR(Val, DAG); if (Val.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, Val); @@ -6328,7 +6422,7 @@ FailedModImm: // value is already in an S or D register. // Do not do this for UNDEF/LOAD nodes because we have better patterns // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR. - if (Op0.getOpcode() != ISD::UNDEF && Op0.getOpcode() != ISD::LOAD && + if (!Op0.isUndef() && Op0.getOpcode() != ISD::LOAD && (ElemSize == 32 || ElemSize == 64)) { unsigned SubIdx = ElemSize == 32 ? 
AArch64::ssub : AArch64::dsub; MachineSDNode *N = @@ -6339,7 +6433,7 @@ FailedModImm: } for (; i < NumElts; ++i) { SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) + if (V.isUndef()) continue; SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); @@ -6580,7 +6674,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, - SDLoc dl, SelectionDAG &DAG) { + const SDLoc &dl, SelectionDAG &DAG) { EVT SrcVT = LHS.getValueType(); assert(VT.getSizeInBits() == SrcVT.getSizeInBits() && "function only supposed to emit natural comparisons"); @@ -6877,12 +6971,10 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { const DataLayout &DL = I->getModule()->getDataLayout(); EVT VT = getValueType(DL, User->getOperand(0)->getType()); - if (isFMAFasterThanFMulAndFAdd(VT) && - isOperationLegalOrCustom(ISD::FMA, VT) && - (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath)) - return false; - - return true; + return !(isFMAFasterThanFMulAndFAdd(VT) && + isOperationLegalOrCustom(ISD::FMA, VT) && + (Options.AllowFPOpFusion == FPOpFusion::Fast || + Options.UnsafeFPMath)); } // All 32-bit GPR operations implicitly zero the high-half of the corresponding @@ -7183,16 +7275,17 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, // 12-bit optionally shifted immediates are legal for adds. bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { - if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)) - return true; - return false; + // Avoid UB for INT64_MIN. + if (Immed == std::numeric_limits<int64_t>::min()) + return false; + // Same encoding for add/sub, just flip the sign. + Immed = std::abs(Immed); + return ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)); } // Integer comparisons are implemented with ADDS/SUBS, so the range of valid // immediates is the same as for an add or a sub. bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { - if (Immed < 0) - Immed *= -1; return isLegalAddImmediate(Immed); } @@ -7244,10 +7337,8 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 - if (!AM.Scale || AM.Scale == 1 || - (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes)) - return true; - return false; + return !AM.Scale || AM.Scale == 1 || + (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes); } int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL, @@ -7334,6 +7425,33 @@ bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return Shift < 3; } +/// Turn vector tests of the signbit in the form of: +/// xor (sra X, elt_size(X)-1), -1 +/// into: +/// cmge X, X, #0 +static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + if (!Subtarget->hasNEON() || !VT.isVector()) + return SDValue(); + + // There must be a shift right algebraic before the xor, and the xor must be a + // 'not' operation. + SDValue Shift = N->getOperand(0); + SDValue Ones = N->getOperand(1); + if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() || + !ISD::isBuildVectorAllOnes(Ones.getNode())) + return SDValue(); + + // The shift should be smearing the sign bit across each vector element. 
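  // (Per-lane semantics: the arithmetic shift right by elt_size(X)-1 yields
  // all-ones for negative lanes and zero otherwise, and the xor with all-ones
  // inverts that, so the whole pattern computes "X >= 0" per lane -- which is
  // exactly what a single CMGEz produces.)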
+ auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); + EVT ShiftEltTy = Shift.getValueType().getVectorElementType(); + if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1) + return SDValue(); + + return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0)); +} + // Generate SUBS and CSEL for integer abs. static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); @@ -7362,13 +7480,15 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -// performXorCombine - Attempts to handle integer ABS. static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); + if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget)) + return Cmp; + return performIntegerAbsCombine(N, DAG); } @@ -7376,6 +7496,10 @@ SDValue AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const { + AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + if (isIntDivCheap(N->getValueType(0), Attr)) + return SDValue(N,0); // Lower SDIV as SDIV + // fold (sdiv X, pow2) EVT VT = N->getValueType(0); if ((VT != MVT::i32 && VT != MVT::i64) || @@ -7426,7 +7550,7 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and // 64-bit is 5 cycles, so this is always a win. if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) { - APInt Value = C->getAPIntValue(); + const APInt &Value = C->getAPIntValue(); EVT VT = N->getValueType(0); SDLoc DL(N); if (Value.isNonNegative()) { @@ -7543,9 +7667,8 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, !cast<LoadSDNode>(N0)->isVolatile()) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), - LN0->getPointerInfo(), LN0->isVolatile(), - LN0->isNonTemporal(), LN0->isInvariant(), - LN0->getAlignment()); + LN0->getPointerInfo(), LN0->getAlignment(), + LN0->getMemOperand()->getFlags()); // Make sure successors of the original load stay after it by updating them // to use the new Chain. 
@@ -7567,7 +7690,8 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); SDValue Op = N->getOperand(0); - if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL) + if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || + Op.getOpcode() != ISD::FMUL) return SDValue(); SDValue ConstVec = Op->getOperand(1); @@ -7801,25 +7925,49 @@ static SDValue tryCombineToBSL(SDNode *N, static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) - if (!EnableAArch64ExtrGeneration) - return SDValue(); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); - SDValue Res = tryCombineToEXTR(N, DCI); - if (Res.getNode()) + if (SDValue Res = tryCombineToEXTR(N, DCI)) return Res; - Res = tryCombineToBSL(N, DCI); - if (Res.getNode()) + if (SDValue Res = tryCombineToBSL(N, DCI)) return Res; return SDValue(); } +static SDValue performSRLCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the + // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32) + // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero. + SDValue N0 = N->getOperand(0); + if (N0.getOpcode() == ISD::BSWAP) { + SDLoc DL(N); + SDValue N1 = N->getOperand(1); + SDValue N00 = N0.getOperand(0); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { + uint64_t ShiftAmt = C->getZExtValue(); + if (VT == MVT::i32 && ShiftAmt == 16 && + DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16))) + return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); + if (VT == MVT::i64 && ShiftAmt == 32 && + DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32))) + return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); + } + } + return SDValue(); +} + static SDValue performBitcastCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -8575,15 +8723,15 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { SDValue BasePtr = St->getBasePtr(); SDValue NewST1 = DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(), - St->isVolatile(), St->isNonTemporal(), St->getAlignment()); + St->getAlignment(), St->getMemOperand()->getFlags()); unsigned Offset = EltOffset; while (--NumVecElts) { SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(Offset, DL, MVT::i64)); NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, - St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), Alignment); + St->getPointerInfo(), Alignment, + St->getMemOperand()->getFlags()); Offset += EltOffset; } return NewST1; @@ -8603,9 +8751,7 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be // a call to that function here. - // Cyclone has bad performance on unaligned 16B stores when crossing line and - // page boundaries. We want to split such stores. - if (!Subtarget->isCyclone()) + if (!Subtarget->isMisaligned128StoreSlow()) return SDValue(); // Don't split at -Oz. 
@@ -8647,12 +8793,12 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SDValue BasePtr = S->getBasePtr(); SDValue NewST1 = DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), - S->isVolatile(), S->isNonTemporal(), S->getAlignment()); + S->getAlignment(), S->getMemOperand()->getFlags()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(8, DL, MVT::i64)); return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, - S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(), - S->getAlignment()); + S->getPointerInfo(), S->getAlignment(), + S->getMemOperand()->getFlags()); } /// Target-specific DAG combine function for post-increment LD1 (lane) and @@ -8741,9 +8887,10 @@ static SDValue performPostLD1Combine(SDNode *N, LoadSDN->getMemOperand()); // Update the uses. - SmallVector<SDValue, 2> NewResults; - NewResults.push_back(SDValue(LD, 0)); // The result of load - NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain + SDValue NewResults[] = { + SDValue(LD, 0), // The result of load + SDValue(UpdN.getNode(), 2) // Chain + }; DCI.CombineTo(LD, NewResults); DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register @@ -8774,8 +8921,7 @@ static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { - SDValue Split = split16BStores(N, DCI, DAG, Subtarget); - if (Split.getNode()) + if (SDValue Split = split16BStores(N, DCI, DAG, Subtarget)) return Split; if (Subtarget->supportsAddressTopByteIgnored() && @@ -9215,10 +9361,8 @@ bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { } case ISD::Constant: case ISD::TargetConstant: { - if (std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) < - 1LL << (width - 1)) - return true; - return false; + return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) < + 1LL << (width - 1); } } @@ -9286,14 +9430,13 @@ bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { // isEquivalentMaskless() is the code for testing if the AND can be removed // factored out of the DAG recognition as the DAG can take several forms. -static -bool isEquivalentMaskless(unsigned CC, unsigned width, - ISD::LoadExtType ExtType, signed AddConstant, - signed CompConstant) { +static bool isEquivalentMaskless(unsigned CC, unsigned width, + ISD::LoadExtType ExtType, int AddConstant, + int CompConstant) { // By being careful about our equations and only writing the in term // symbolic values and well known constants (0, 1, -1, MaxUInt) we can // make them generally applicable to all bit widths. - signed MaxUInt = (1 << width); + int MaxUInt = (1 << width); // For the purposes of these comparisons sign extending the type is // equivalent to zero extending the add and displacing it by half the integer @@ -9441,8 +9584,7 @@ SDValue performCONDCombine(SDNode *N, static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { - SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3); - if (NV.getNode()) + if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3)) N = NV.getNode(); SDValue Chain = N->getOperand(0); SDValue Dest = N->getOperand(1); @@ -9678,7 +9820,7 @@ static SDValue performSelectCombine(SDNode *N, // Now duplicate the comparison mask we want across all other lanes. 
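  // (An all-zero shuffle mask broadcasts lane 0, so the comparison result held
  // in lane 0 is replicated into every lane, giving a per-lane all-true or
  // all-false mask.)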
SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0); - SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data()); + SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask); Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(), Mask); @@ -9716,6 +9858,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performFDivCombine(N, DAG, Subtarget); case ISD::OR: return performORCombine(N, DCI, Subtarget); + case ISD::SRL: + return performSRLCombine(N, DCI); case ISD::INTRINSIC_WO_CHAIN: return performIntrinsicCombine(N, DCI, Subtarget); case ISD::ANY_EXTEND: @@ -9829,10 +9973,7 @@ bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, // return instructions to help enable tail call optimizations for this // instruction. bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { - if (!CI->isTailCall()) - return false; - - return true; + return CI->isTailCall(); } bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, @@ -9935,6 +10076,31 @@ static void ReplaceReductionResults(SDNode *N, Results.push_back(SplitVal); } +static void ReplaceCMP_SWAP_128Results(SDNode *N, + SmallVectorImpl<SDValue> & Results, + SelectionDAG &DAG) { + assert(N->getValueType(0) == MVT::i128 && + "AtomicCmpSwap on types less than 128 should be legal"); + SDValue Ops[] = {N->getOperand(1), + N->getOperand(2)->getOperand(0), + N->getOperand(2)->getOperand(1), + N->getOperand(3)->getOperand(0), + N->getOperand(3)->getOperand(1), + N->getOperand(0)}; + SDNode *CmpSwap = DAG.getMachineNode( + AArch64::CMP_SWAP_128, SDLoc(N), + DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops); + + MachineFunction &MF = DAG.getMachineFunction(); + MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1); + MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1); + + Results.push_back(SDValue(CmpSwap, 0)); + Results.push_back(SDValue(CmpSwap, 1)); + Results.push_back(SDValue(CmpSwap, 3)); +} + void AArch64TargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { @@ -9966,11 +10132,16 @@ void AArch64TargetLowering::ReplaceNodeResults( assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); // Let normal code take care of it by not adding anything to Results. return; + case ISD::ATOMIC_CMP_SWAP: + ReplaceCMP_SWAP_128Results(N, Results, DAG); + return; } } bool AArch64TargetLowering::useLoadStackGuardNode() const { - return true; + if (!Subtarget->isTargetAndroid()) + return true; + return TargetLowering::useLoadStackGuardNode(); } unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { @@ -10017,14 +10188,19 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( AtomicCmpXchgInst *AI) const { - return true; + // At -O0, fast-regalloc cannot cope with the live vregs necessary to + // implement cmpxchg without spilling. If the address being exchanged is also + // on the stack and close enough to the spill slot, this can lead to a + // situation where the monitor always gets cleared and the atomic operation + // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. 
+ return getTargetMachine().getOptLevel() != 0; } Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Type *ValTy = cast<PointerType>(Addr->getType())->getElementType(); - bool IsAcquire = isAtLeastAcquire(Ord); + bool IsAcquire = isAcquireOrStronger(Ord); // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd // intrinsic must return {i64, i64} and we have to recombine them into a @@ -10066,7 +10242,7 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - bool IsRelease = isAtLeastRelease(Ord); + bool IsRelease = isReleaseOrStronger(Ord); // Since the intrinsics must have legal type, the i128 intrinsics take two // parameters: "i64, i64". We must marshal Val into the appropriate form @@ -10104,6 +10280,22 @@ bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, return false; } +Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { + if (!Subtarget->isTargetAndroid()) + return TargetLowering::getIRStackGuard(IRB); + + // Android provides a fixed TLS slot for the stack cookie. See the definition + // of TLS_SLOT_STACK_GUARD in + // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h + const unsigned TlsOffset = 0x28; + Module *M = IRB.GetInsertBlock()->getParent()->getParent(); + Function *ThreadPointerFunc = + Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); + return IRB.CreatePointerCast( + IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), + Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); +} + Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { if (!Subtarget->isTargetAndroid()) return TargetLowering::getSafeStackPointerLocation(IRB); @@ -10114,7 +10306,7 @@ Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) cons const unsigned TlsOffset = 0x48; Module *M = IRB.GetInsertBlock()->getParent()->getParent(); Function *ThreadPointerFunc = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_thread_pointer); + Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); return IRB.CreatePointerCast( IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); @@ -10166,3 +10358,16 @@ void AArch64TargetLowering::insertCopiesSplitCSR( .addReg(NewVR); } } + +bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { + // Integer division on AArch64 is expensive. However, when aggressively + // optimizing for code size, we prefer to use a div instruction, as it is + // usually smaller than the alternative sequence. + // The exception to this is vector division. Since AArch64 doesn't have vector + // integer division, leaving the division as-is is a loss even in terms of + // size, because it will have to be scalarized, while the alternative code + // sequence can be performed in vector form. 
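  //
  // For example, with minsize a scalar "n / d" is left as a single sdiv/udiv
  // here, whereas the usual combines would otherwise expand a constant divisor
  // into a longer multiply-by-magic-constant and shift sequence.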
+ bool OptSize = + Attr.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); + return OptSize && !VT.isVector(); +} diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index e99616c94068f..c87cfed1f892b 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -187,6 +187,10 @@ enum NodeType : unsigned { SMULL, UMULL, + // Reciprocal estimates. + FRECPE, + FRSQRTE, + // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, LD3post, @@ -272,11 +276,11 @@ public: SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; - MachineBasicBlock *EmitF128CSEL(MachineInstr *MI, + MachineBasicBlock *EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock * - EmitInstrWithCustomInserter(MachineInstr *MI, + EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, @@ -358,6 +362,10 @@ public: TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; + /// If the target has a standard location for the stack protector cookie, + /// returns the address of that location. Otherwise, returns nullptr. + Value *getIRStackGuard(IRBuilder<> &IRB) const override; + /// If the target has a standard location for the unsafe stack pointer, /// returns the address of that location. Otherwise, returns nullptr. Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override; @@ -378,6 +386,8 @@ public: return AArch64::X1; } + bool isIntDivCheap(EVT VT, AttributeSet Attr) const override; + bool isCheapToSpeculateCttz() const override { return true; } @@ -385,6 +395,12 @@ public: bool isCheapToSpeculateCtlz() const override { return true; } + + bool hasBitPreservingFPLogic(EVT VT) const override { + // FIXME: Is this always true? It should be true for vectors at least. + return VT == MVT::f32 || VT == MVT::f64; + } + bool supportSplitCSR(MachineFunction *MF) const override { return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); @@ -394,6 +410,10 @@ public: MachineBasicBlock *Entry, const SmallVectorImpl<MachineBasicBlock *> &Exits) const override; + bool supportSwiftError() const override { + return true; + } + private: bool isExtFreeImpl(const Instruction *Ext) const override; @@ -401,30 +421,30 @@ private: /// make the right decision when generating code for different targets. 
const AArch64Subtarget *Subtarget; - void addTypeForNEON(EVT VT, EVT PromotedBitwiseVT); + void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT); void addDRTypeForNEON(MVT VT); void addQRTypeForNEON(MVT VT); - SDValue - LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, - SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const override; + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + const SDLoc &DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const override; SDValue LowerCall(CallLoweringInfo & /*CLI*/, SmallVectorImpl<SDValue> &InVals) const override; SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, - SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, - bool isThisReturn, SDValue ThisVal) const; + const SmallVectorImpl<ISD::InputArg> &Ins, + const SDLoc &DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals, bool isThisReturn, + SDValue ThisVal) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; bool isEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, - bool isCalleeStructRet, bool isCallerStructRet, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const; @@ -439,7 +459,7 @@ private: bool IsTailCallConvention(CallingConv::ID CallCC) const; - void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL, + void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &DL, SDValue &Chain) const; bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, @@ -449,21 +469,21 @@ private: SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, SDLoc DL, + const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL, + SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, const SDLoc &DL, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, - SDValue TVal, SDValue FVal, SDLoc dl, + SDValue TVal, SDValue FVal, const SDLoc &dl, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; @@ -500,6 +520,11 @@ private: SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const override; + SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const override; + SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const override; unsigned 
combineRepeatedFPDivisors() const override; ConstraintType getConstraintType(StringRef Constraint) const override; @@ -515,6 +540,9 @@ private: std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; + + const char *LowerXConstraint(EVT ConstraintVT) const override; + void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td index 4923a1161dfcf..59de62ad28771 100644 --- a/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/lib/Target/AArch64/AArch64InstrAtomics.td @@ -29,7 +29,7 @@ def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>; class acquiring_load<PatFrag base> : PatFrag<(ops node:$ptr), (base node:$ptr), [{ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering(); - return isAtLeastAcquire(Ordering); + return isAcquireOrStronger(Ordering); }]>; // An atomic load operation that does not need either acquire or release @@ -37,7 +37,7 @@ class acquiring_load<PatFrag base> class relaxed_load<PatFrag base> : PatFrag<(ops node:$ptr), (base node:$ptr), [{ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering(); - return !isAtLeastAcquire(Ordering); + return !isAcquireOrStronger(Ordering); }]>; // 8-bit loads @@ -112,15 +112,16 @@ def : Pat<(relaxed_load<atomic_load_64> class releasing_store<PatFrag base> : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering(); - assert(Ordering != AcquireRelease && "unexpected store ordering"); - return isAtLeastRelease(Ordering); + assert(Ordering != AtomicOrdering::AcquireRelease && + "unexpected store ordering"); + return isReleaseOrStronger(Ordering); }]>; // An atomic store operation that doesn't actually need to be atomic on AArch64. class relaxed_store<PatFrag base> : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering(); - return !isAtLeastRelease(Ordering); + return !isReleaseOrStronger(Ordering); }]>; // 8-bit stores @@ -361,3 +362,43 @@ def : Pat<(stlxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr), // And clear exclusive. def : Pat<(int_aarch64_clrex), (CLREX 0xf)>; + +//===---------------------------------- +// Atomic cmpxchg for -O0 +//===---------------------------------- + +// The fast register allocator used during -O0 inserts spills to cover any VRegs +// live across basic block boundaries. When this happens between an LDXR and an +// STXR it can clear the exclusive monitor, causing all cmpxchg attempts to +// fail. + +// Unfortunately, this means we have to have an alternative (expanded +// post-regalloc) path for -O0 compilations. Fortunately this path can be +// significantly more naive than the standard expansion: we conservatively +// assume seq_cst, strong cmpxchg and omit clrex on failure. 
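// For reference, the post-regalloc expansion of, e.g., CMP_SWAP_32 produces a
// loop along these lines (a sketch only; the pseudos below carry no selection
// pattern and are expanded late):
//   retry:  ldaxr   wDest, [xAddr]
//           cmp     wDest, wDesired
//           b.ne    done              // no clrex on the failure path
//           stlxr   wStatus, wNew, [xAddr]
//           cbnz    wStatus, retry
//   done: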
+ +let Constraints = "@earlyclobber $Rd,@earlyclobber $status", + mayLoad = 1, mayStore = 1 in { +def CMP_SWAP_8 : Pseudo<(outs GPR32:$Rd, GPR32:$status), + (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>, + Sched<[WriteAtomic]>; + +def CMP_SWAP_16 : Pseudo<(outs GPR32:$Rd, GPR32:$status), + (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>, + Sched<[WriteAtomic]>; + +def CMP_SWAP_32 : Pseudo<(outs GPR32:$Rd, GPR32:$status), + (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>, + Sched<[WriteAtomic]>; + +def CMP_SWAP_64 : Pseudo<(outs GPR64:$Rd, GPR32:$status), + (ins GPR64:$addr, GPR64:$desired, GPR64:$new), []>, + Sched<[WriteAtomic]>; +} + +let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi,@earlyclobber $status", + mayLoad = 1, mayStore = 1 in +def CMP_SWAP_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32:$status), + (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi, + GPR64:$newLo, GPR64:$newHi), []>, + Sched<[WriteAtomic]>; diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index 6ac2175e50355..34d35e961210e 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -496,7 +496,7 @@ def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{ return ((uint32_t)Imm) < 65536; }]> { let ParserMatchClass = Imm0_65535Operand; - let PrintMethod = "printHexImm"; + let PrintMethod = "printImmHex"; } // imm0_255 predicate - True if the immediate is in the range [0,255]. @@ -505,7 +505,7 @@ def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ return ((uint32_t)Imm) < 256; }]> { let ParserMatchClass = Imm0_255Operand; - let PrintMethod = "printHexImm"; + let PrintMethod = "printImm"; } // imm0_127 predicate - True if the immediate is in the range [0,127] @@ -514,7 +514,7 @@ def imm0_127 : Operand<i32>, ImmLeaf<i32, [{ return ((uint32_t)Imm) < 128; }]> { let ParserMatchClass = Imm0_127Operand; - let PrintMethod = "printHexImm"; + let PrintMethod = "printImm"; } // NOTE: These imm0_N operands have to be of type i64 because i64 is the size @@ -923,10 +923,7 @@ def psbhint_op : Operand<i32> { // "psb" is an alias to "hint" only for certain values of CRm:Op2 fields. 
if (!MCOp.isImm()) return false; - bool ValidNamed; - (void)AArch64PSBHint::PSBHintMapper().toString(MCOp.getImm(), - STI.getFeatureBits(), ValidNamed); - return ValidNamed; + return AArch64PSBHint::lookupPSBByEncoding(MCOp.getImm()) != nullptr; }]; } @@ -1549,7 +1546,7 @@ class ADRI<bit page, string asm, Operand adr, list<dag> pattern> def movimm32_imm : Operand<i32> { let ParserMatchClass = Imm0_65535Operand; let EncoderMethod = "getMoveWideImmOpValue"; - let PrintMethod = "printHexImm"; + let PrintMethod = "printImm"; } def movimm32_shift : Operand<i32> { let PrintMethod = "printShifter"; @@ -9377,7 +9374,8 @@ class BaseCASEncoding<dag oops, dag iops, string asm, string operands, class BaseCAS<string order, string size, RegisterClass RC> : BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn), "cas" # order # size, "\t$Rs, $Rt, [$Rn]", - "$out = $Rs",[]> { + "$out = $Rs",[]>, + Sched<[WriteAtomic]> { let NP = 1; } @@ -9391,7 +9389,8 @@ multiclass CompareAndSwap<bits<1> Acq, bits<1> Rel, string order> { class BaseCASP<string order, string size, RegisterOperand RC> : BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn), "casp" # order # size, "\t$Rs, $Rt, [$Rn]", - "$out = $Rs",[]> { + "$out = $Rs",[]>, + Sched<[WriteAtomic]> { let NP = 0; } @@ -9405,7 +9404,8 @@ multiclass CompareAndSwapPair<bits<1> Acq, bits<1> Rel, string order> { let Predicates = [HasV8_1a] in class BaseSWP<string order, string size, RegisterClass RC> : I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "swp" # order # size, - "\t$Rs, $Rt, [$Rn]","",[]> { + "\t$Rs, $Rt, [$Rn]","",[]>, + Sched<[WriteAtomic]> { bits<2> Sz; bit Acq; bit Rel; @@ -9436,7 +9436,8 @@ multiclass Swap<bits<1> Acq, bits<1> Rel, string order> { let Predicates = [HasV8_1a], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in class BaseLDOPregister<string op, string order, string size, RegisterClass RC> : I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "ld" # op # order # size, - "\t$Rs, $Rt, [$Rn]","",[]> { + "\t$Rs, $Rt, [$Rn]","",[]>, + Sched<[WriteAtomic]> { bits<2> Sz; bit Acq; bit Rel; diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index f398117de953b..0aa4708f35ac4 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -22,27 +22,31 @@ #include "llvm/MC/MCInst.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" +#include <algorithm> using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "AArch64GenInstrInfo.inc" +static LLVM_CONSTEXPR MachineMemOperand::Flags MOSuppressPair = + MachineMemOperand::MOTargetFlag1; + AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP), RI(STI.getTargetTriple()), Subtarget(STI) {} /// GetInstSize - Return the number of bytes of code the specified /// instruction may be. This returns the maximum number of bytes. 
-unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { - const MachineBasicBlock &MBB = *MI->getParent(); +unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr &MI) const { + const MachineBasicBlock &MBB = *MI.getParent(); const MachineFunction *MF = MBB.getParent(); const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); - if (MI->getOpcode() == AArch64::INLINEASM) - return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI); + if (MI.getOpcode() == AArch64::INLINEASM) + return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); - const MCInstrDesc &Desc = MI->getDesc(); + const MCInstrDesc &Desc = MI.getDesc(); switch (Desc.getOpcode()) { default: // Anything not explicitly designated otherwise is a nomal 4-byte insn. @@ -89,25 +93,25 @@ static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, } // Branch analysis. -bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, - MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify) const { +bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); if (I == MBB.end()) return false; - if (!isUnpredicatedTerminator(I)) + if (!isUnpredicatedTerminator(*I)) return false; // Get the last instruction in the block. - MachineInstr *LastInst = I; + MachineInstr *LastInst = &*I; // If there is only one terminator instruction, process it. unsigned LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { if (isUncondBranchOpcode(LastOpc)) { TBB = LastInst->getOperand(0).getMBB(); return false; @@ -121,7 +125,7 @@ bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, } // Get the instruction before it if it is a terminator. - MachineInstr *SecondLastInst = I; + MachineInstr *SecondLastInst = &*I; unsigned SecondLastOpc = SecondLastInst->getOpcode(); // If AllowModify is true and the block ends with two or more unconditional @@ -131,19 +135,19 @@ bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, LastInst->eraseFromParent(); LastInst = SecondLastInst; LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { // Return now the only terminator is an unconditional branch. TBB = LastInst->getOperand(0).getMBB(); return false; } else { - SecondLastInst = I; + SecondLastInst = &*I; SecondLastOpc = SecondLastInst->getOpcode(); } } } // If there are three terminators, we don't know what sort of block this is. - if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) + if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I)) return true; // If the block ends with a B and a Bcc, handle it. 
@@ -243,7 +247,7 @@ unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { } void AArch64InstrInfo::instantiateCondBranch( - MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB, + MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB, ArrayRef<MachineOperand> Cond) const { if (Cond[0].getImm() != -1) { // Regular Bcc @@ -259,9 +263,11 @@ void AArch64InstrInfo::instantiateCondBranch( } } -unsigned AArch64InstrInfo::InsertBranch( - MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - ArrayRef<MachineOperand> Cond, DebugLoc DL) const { +unsigned AArch64InstrInfo::InsertBranch(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + ArrayRef<MachineOperand> Cond, + const DebugLoc &DL) const { // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); @@ -399,8 +405,8 @@ bool AArch64InstrInfo::canInsertSelect( } void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DstReg, + MachineBasicBlock::iterator I, + const DebugLoc &DL, unsigned DstReg, ArrayRef<MachineOperand> Cond, unsigned TrueReg, unsigned FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -533,8 +539,8 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, } /// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx. -static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) { - uint64_t Imm = MI->getOperand(1).getImm(); +static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) { + uint64_t Imm = MI.getOperand(1).getImm(); uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); uint64_t Encoding; return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding); @@ -542,11 +548,13 @@ static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) { // FIXME: this implementation should be micro-architecture dependent, so a // micro-architecture target hook should be introduced here in future. 
-bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { - if (!Subtarget.isCortexA57() && !Subtarget.isCortexA53()) - return MI->isAsCheapAsAMove(); +bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { + if (!Subtarget.hasCustomCheapAsMoveHandling()) + return MI.isAsCheapAsAMove(); + + unsigned Imm; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: return false; @@ -555,7 +563,17 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { case AArch64::ADDXri: case AArch64::SUBWri: case AArch64::SUBXri: - return (MI->getOperand(3).getImm() == 0); + return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 || + MI.getOperand(3).getImm() == 0); + + // add/sub on register with shift + case AArch64::ADDWrs: + case AArch64::ADDXrs: + case AArch64::SUBWrs: + case AArch64::SUBXrs: + Imm = MI.getOperand(3).getImm(); + return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 && + AArch64_AM::getArithShiftValue(Imm) < 4); // logical ops on immediate case AArch64::ANDWri: @@ -580,12 +598,41 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { case AArch64::ORRWrr: case AArch64::ORRXrr: return true; + + // logical ops on register with shift + case AArch64::ANDWrs: + case AArch64::ANDXrs: + case AArch64::BICWrs: + case AArch64::BICXrs: + case AArch64::EONWrs: + case AArch64::EONXrs: + case AArch64::EORWrs: + case AArch64::EORXrs: + case AArch64::ORNWrs: + case AArch64::ORNXrs: + case AArch64::ORRWrs: + case AArch64::ORRXrs: + Imm = MI.getOperand(3).getImm(); + return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 && + AArch64_AM::getShiftValue(Imm) < 4 && + AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL); + // If MOVi32imm or MOVi64imm can be expanded into ORRWri or // ORRXri, it is as cheap as MOV case AArch64::MOVi32imm: return canBeExpandedToORR(MI, 32); case AArch64::MOVi64imm: return canBeExpandedToORR(MI, 64); + + // It is cheap to zero out registers if the subtarget has ZeroCycleZeroing + // feature. 
+ case AArch64::FMOVS0: + case AArch64::FMOVD0: + return Subtarget.hasZeroCycleZeroing(); + case TargetOpcode::COPY: + return (Subtarget.hasZeroCycleZeroing() && + (MI.getOperand(1).getReg() == AArch64::WZR || + MI.getOperand(1).getReg() == AArch64::XZR)); } llvm_unreachable("Unknown opcode to check as cheap as a move!"); @@ -611,20 +658,18 @@ bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, } } -bool -AArch64InstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, - MachineInstr *MIb, - AliasAnalysis *AA) const { +bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( + MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA) const { const TargetRegisterInfo *TRI = &getRegisterInfo(); unsigned BaseRegA = 0, BaseRegB = 0; - int OffsetA = 0, OffsetB = 0; - int WidthA = 0, WidthB = 0; + int64_t OffsetA = 0, OffsetB = 0; + unsigned WidthA = 0, WidthB = 0; - assert(MIa && MIa->mayLoadOrStore() && "MIa must be a load or store."); - assert(MIb && MIb->mayLoadOrStore() && "MIb must be a load or store."); + assert(MIa.mayLoadOrStore() && "MIa must be a load or store."); + assert(MIb.mayLoadOrStore() && "MIb must be a load or store."); - if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects() || - MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef()) + if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() || + MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) return false; // Retrieve the base register, offset from the base register and width. Width @@ -648,10 +693,10 @@ AArch64InstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, /// analyzeCompare - For a comparison instruction, return the source registers /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. /// Return true if the comparison instruction can be analyzed. -bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, +bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, unsigned &SrcReg2, int &CmpMask, int &CmpValue) const { - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: break; case AArch64::SUBSWrr: @@ -667,8 +712,8 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, case AArch64::ADDSXrs: case AArch64::ADDSXrx: // Replace SUBSWrr with SUBWrr if NZCV is not used. - SrcReg = MI->getOperand(1).getReg(); - SrcReg2 = MI->getOperand(2).getReg(); + SrcReg = MI.getOperand(1).getReg(); + SrcReg2 = MI.getOperand(2).getReg(); CmpMask = ~0; CmpValue = 0; return true; @@ -676,17 +721,17 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, case AArch64::ADDSWri: case AArch64::SUBSXri: case AArch64::ADDSXri: - SrcReg = MI->getOperand(1).getReg(); + SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; // FIXME: In order to convert CmpValue to 0 or 1 - CmpValue = (MI->getOperand(2).getImm() != 0); + CmpValue = MI.getOperand(2).getImm() != 0; return true; case AArch64::ANDSWri: case AArch64::ANDSXri: // ANDS does not use the same encoding scheme as the others xxxS // instructions. - SrcReg = MI->getOperand(1).getReg(); + SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; // FIXME:The return val type of decodeLogicalImmediate is uint64_t, @@ -694,17 +739,17 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, // the high 32 bits of uint64_t will be lost. 
// In fact it causes a bug in spec2006-483.xalancbmk // CmpValue is only used to compare with zero in OptimizeCompareInstr - CmpValue = (AArch64_AM::decodeLogicalImmediate( - MI->getOperand(2).getImm(), - MI->getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0); + CmpValue = AArch64_AM::decodeLogicalImmediate( + MI.getOperand(2).getImm(), + MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0; return true; } return false; } -static bool UpdateOperandRegClass(MachineInstr *Instr) { - MachineBasicBlock *MBB = Instr->getParent(); +static bool UpdateOperandRegClass(MachineInstr &Instr) { + MachineBasicBlock *MBB = Instr.getParent(); assert(MBB && "Can't get MachineBasicBlock here"); MachineFunction *MF = MBB->getParent(); assert(MF && "Can't get MachineFunction here"); @@ -712,11 +757,11 @@ static bool UpdateOperandRegClass(MachineInstr *Instr) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); MachineRegisterInfo *MRI = &MF->getRegInfo(); - for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx; + for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx; ++OpIdx) { - MachineOperand &MO = Instr->getOperand(OpIdx); + MachineOperand &MO = Instr.getOperand(OpIdx); const TargetRegisterClass *OpRegCstraints = - Instr->getRegClassConstraint(OpIdx, TII, TRI); + Instr.getRegClassConstraint(OpIdx, TII, TRI); // If there's no constraint, there's nothing to do. if (!OpRegCstraints) @@ -744,16 +789,16 @@ static bool UpdateOperandRegClass(MachineInstr *Instr) { /// \brief Return the opcode that does not set flags when possible - otherwise /// return the original opcode. The caller is responsible to do the actual /// substitution and legality checking. -static unsigned convertFlagSettingOpcode(const MachineInstr *MI) { +static unsigned convertFlagSettingOpcode(const MachineInstr &MI) { // Don't convert all compare instructions, because for some the zero register // encoding becomes the sp register. bool MIDefinesZeroReg = false; - if (MI->definesRegister(AArch64::WZR) || MI->definesRegister(AArch64::XZR)) + if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR)) MIDefinesZeroReg = true; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: - return MI->getOpcode(); + return MI.getOpcode(); case AArch64::ADDSWrr: return AArch64::ADDWrr; case AArch64::ADDSWri: @@ -789,60 +834,76 @@ static unsigned convertFlagSettingOpcode(const MachineInstr *MI) { } } -/// True when condition code could be modified on the instruction -/// trace starting at from and ending at to. -static bool modifiesConditionCode(MachineInstr *From, MachineInstr *To, - const bool CheckOnlyCCWrites, - const TargetRegisterInfo *TRI) { - // We iterate backward starting \p To until we hit \p From - MachineBasicBlock::iterator I = To, E = From, B = To->getParent()->begin(); +enum AccessKind { + AK_Write = 0x01, + AK_Read = 0x10, + AK_All = 0x11 +}; +/// True when condition flags are accessed (either by writing or reading) +/// on the instruction trace starting at From and ending at To. +/// +/// Note: If From and To are from different blocks it's assumed CC are accessed +/// on the path. +static bool areCFlagsAccessedBetweenInstrs( + MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, + const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) { // Early exit if To is at the beginning of the BB. 
- if (I == B) + if (To == To->getParent()->begin()) return true; - // Check whether the definition of SrcReg is in the same basic block as - // Compare. If not, assume the condition code gets modified on some path. + // Check whether the instructions are in the same basic block + // If not, assume the condition flags might get modified somewhere. if (To->getParent() != From->getParent()) return true; - // Check that NZCV isn't set on the trace. - for (--I; I != E; --I) { - const MachineInstr &Instr = *I; + // From must be above To. + assert(std::find_if(MachineBasicBlock::reverse_iterator(To), + To->getParent()->rend(), [From](MachineInstr &MI) { + return MachineBasicBlock::iterator(MI) == From; + }) != To->getParent()->rend()); - if (Instr.modifiesRegister(AArch64::NZCV, TRI) || - (!CheckOnlyCCWrites && Instr.readsRegister(AArch64::NZCV, TRI))) - // This instruction modifies or uses NZCV after the one we want to - // change. - return true; - if (I == B) - // We currently don't allow the instruction trace to cross basic - // block boundaries + // We iterate backward starting \p To until we hit \p From. + for (--To; To != From; --To) { + const MachineInstr &Instr = *To; + + if ( ((AccessToCheck & AK_Write) && Instr.modifiesRegister(AArch64::NZCV, TRI)) || + ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI))) return true; } return false; } -/// optimizeCompareInstr - Convert the instruction supplying the argument to the -/// comparison into one that sets the zero bit in the flags register. + +/// Try to optimize a compare instruction. A compare instruction is an +/// instruction which produces AArch64::NZCV. It can be truly compare instruction +/// when there are no uses of its destination register. +/// +/// The following steps are tried in order: +/// 1. Convert CmpInstr into an unconditional version. +/// 2. Remove CmpInstr if above there is an instruction producing a needed +/// condition code or an instruction which can be converted into such an instruction. +/// Only comparison with zero is supported. bool AArch64InstrInfo::optimizeCompareInstr( - MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, + MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const { + assert(CmpInstr.getParent()); + assert(MRI); // Replace SUBSWrr with SUBWrr if NZCV is not used. - int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(AArch64::NZCV, true); - if (Cmp_NZCV != -1) { - if (CmpInstr->definesRegister(AArch64::WZR) || - CmpInstr->definesRegister(AArch64::XZR)) { - CmpInstr->eraseFromParent(); + int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true); + if (DeadNZCVIdx != -1) { + if (CmpInstr.definesRegister(AArch64::WZR) || + CmpInstr.definesRegister(AArch64::XZR)) { + CmpInstr.eraseFromParent(); return true; } - unsigned Opc = CmpInstr->getOpcode(); + unsigned Opc = CmpInstr.getOpcode(); unsigned NewOpc = convertFlagSettingOpcode(CmpInstr); if (NewOpc == Opc) return false; const MCInstrDesc &MCID = get(NewOpc); - CmpInstr->setDesc(MCID); - CmpInstr->RemoveOperand(Cmp_NZCV); + CmpInstr.setDesc(MCID); + CmpInstr.RemoveOperand(DeadNZCVIdx); bool succeeded = UpdateOperandRegClass(CmpInstr); (void)succeeded; assert(succeeded && "Some operands reg class are incompatible!"); @@ -857,23 +918,21 @@ bool AArch64InstrInfo::optimizeCompareInstr( return false; // CmpInstr is a Compare instruction if destination register is not used. 
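  // (For instance, at the assembly level "cmp w0, #1" is just an alias for
  // "subs wzr, w0, #1": once the GPR result of an adds/subs is never read, the
  // instruction only exists for its NZCV side effect.)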
- if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg())) - return false; - - // Get the unique definition of SrcReg. - MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); - if (!MI) + if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) return false; - bool CheckOnlyCCWrites = false; - const TargetRegisterInfo *TRI = &getRegisterInfo(); - if (modifiesConditionCode(MI, CmpInstr, CheckOnlyCCWrites, TRI)) - return false; + return substituteCmpToZero(CmpInstr, SrcReg, MRI); +} - unsigned NewOpc = MI->getOpcode(); - switch (MI->getOpcode()) { +/// Get the opcode of the S version of Instr. +/// If Instr is already the S version, its opcode is returned. +/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S version +/// or we are not interested in it. +static unsigned sForm(MachineInstr &Instr) { + switch (Instr.getOpcode()) { default: - return false; + return AArch64::INSTRUCTION_LIST_END; + case AArch64::ADDSWrr: case AArch64::ADDSWri: case AArch64::ADDSXrr: @@ -882,116 +941,221 @@ bool AArch64InstrInfo::optimizeCompareInstr( case AArch64::SUBSWri: case AArch64::SUBSXrr: case AArch64::SUBSXri: - break; - case AArch64::ADDWrr: NewOpc = AArch64::ADDSWrr; break; - case AArch64::ADDWri: NewOpc = AArch64::ADDSWri; break; - case AArch64::ADDXrr: NewOpc = AArch64::ADDSXrr; break; - case AArch64::ADDXri: NewOpc = AArch64::ADDSXri; break; - case AArch64::ADCWr: NewOpc = AArch64::ADCSWr; break; - case AArch64::ADCXr: NewOpc = AArch64::ADCSXr; break; - case AArch64::SUBWrr: NewOpc = AArch64::SUBSWrr; break; - case AArch64::SUBWri: NewOpc = AArch64::SUBSWri; break; - case AArch64::SUBXrr: NewOpc = AArch64::SUBSXrr; break; - case AArch64::SUBXri: NewOpc = AArch64::SUBSXri; break; - case AArch64::SBCWr: NewOpc = AArch64::SBCSWr; break; - case AArch64::SBCXr: NewOpc = AArch64::SBCSXr; break; - case AArch64::ANDWri: NewOpc = AArch64::ANDSWri; break; - case AArch64::ANDXri: NewOpc = AArch64::ANDSXri; break; - } - - // Scan forward for the use of NZCV. - // When checking against MI: if it's a conditional code requires - // checking of V bit, then this is not safe to do. - // It is safe to remove CmpInstr if NZCV is redefined or killed. - // If we are done with the basic block, we need to check whether NZCV is - // live-out. - bool IsSafe = false; - for (MachineBasicBlock::iterator I = CmpInstr, - E = CmpInstr->getParent()->end(); - !IsSafe && ++I != E;) { - const MachineInstr &Instr = *I; - for (unsigned IO = 0, EO = Instr.getNumOperands(); !IsSafe && IO != EO; - ++IO) { - const MachineOperand &MO = Instr.getOperand(IO); - if (MO.isRegMask() && MO.clobbersPhysReg(AArch64::NZCV)) { - IsSafe = true; - break; - } - if (!MO.isReg() || MO.getReg() != AArch64::NZCV) - continue; - if (MO.isDef()) { - IsSafe = true; - break; - } + return Instr.getOpcode(); + + case AArch64::ADDWrr: return AArch64::ADDSWrr; + case AArch64::ADDWri: return AArch64::ADDSWri; + case AArch64::ADDXrr: return AArch64::ADDSXrr; + case AArch64::ADDXri: return AArch64::ADDSXri; + case AArch64::ADCWr: return AArch64::ADCSWr; + case AArch64::ADCXr: return AArch64::ADCSXr; + case AArch64::SUBWrr: return AArch64::SUBSWrr; + case AArch64::SUBWri: return AArch64::SUBSWri; + case AArch64::SUBXrr: return AArch64::SUBSXrr; + case AArch64::SUBXri: return AArch64::SUBSXri; + case AArch64::SBCWr: return AArch64::SBCSWr; + case AArch64::SBCXr: return AArch64::SBCSXr; + case AArch64::ANDWri: return AArch64::ANDSWri; + case AArch64::ANDXri: return AArch64::ANDSXri; + } +} - // Decode the condition code. 
- unsigned Opc = Instr.getOpcode(); - AArch64CC::CondCode CC; - switch (Opc) { - default: - return false; - case AArch64::Bcc: - CC = (AArch64CC::CondCode)Instr.getOperand(IO - 2).getImm(); - break; - case AArch64::CSINVWr: - case AArch64::CSINVXr: - case AArch64::CSINCWr: - case AArch64::CSINCXr: - case AArch64::CSELWr: - case AArch64::CSELXr: - case AArch64::CSNEGWr: - case AArch64::CSNEGXr: - case AArch64::FCSELSrrr: - case AArch64::FCSELDrrr: - CC = (AArch64CC::CondCode)Instr.getOperand(IO - 1).getImm(); - break; - } +/// Check if AArch64::NZCV should be alive in successors of MBB. +static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) { + for (auto *BB : MBB->successors()) + if (BB->isLiveIn(AArch64::NZCV)) + return true; + return false; +} - // It is not safe to remove Compare instruction if Overflow(V) is used. - switch (CC) { - default: - // NZCV can be used multiple times, we should continue. - break; - case AArch64CC::VS: - case AArch64CC::VC: - case AArch64CC::GE: - case AArch64CC::LT: - case AArch64CC::GT: - case AArch64CC::LE: - return false; - } +struct UsedNZCV { + bool N; + bool Z; + bool C; + bool V; + UsedNZCV(): N(false), Z(false), C(false), V(false) {} + UsedNZCV& operator |=(const UsedNZCV& UsedFlags) { + this->N |= UsedFlags.N; + this->Z |= UsedFlags.Z; + this->C |= UsedFlags.C; + this->V |= UsedFlags.V; + return *this; + } +}; + +/// Find a condition code used by the instruction. +/// Returns AArch64CC::Invalid if either the instruction does not use condition +/// codes or we don't optimize CmpInstr in the presence of such instructions. +static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) { + switch (Instr.getOpcode()) { + default: + return AArch64CC::Invalid; + + case AArch64::Bcc: { + int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); + assert(Idx >= 2); + return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm()); } + + case AArch64::CSINVWr: + case AArch64::CSINVXr: + case AArch64::CSINCWr: + case AArch64::CSINCXr: + case AArch64::CSELWr: + case AArch64::CSELXr: + case AArch64::CSNEGWr: + case AArch64::CSNEGXr: + case AArch64::FCSELSrrr: + case AArch64::FCSELDrrr: { + int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); + assert(Idx >= 1); + return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm()); + } + } +} + +static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) { + assert(CC != AArch64CC::Invalid); + UsedNZCV UsedFlags; + switch (CC) { + default: + break; + + case AArch64CC::EQ: // Z set + case AArch64CC::NE: // Z clear + UsedFlags.Z = true; + break; + + case AArch64CC::HI: // Z clear and C set + case AArch64CC::LS: // Z set or C clear + UsedFlags.Z = true; + case AArch64CC::HS: // C set + case AArch64CC::LO: // C clear + UsedFlags.C = true; + break; + + case AArch64CC::MI: // N set + case AArch64CC::PL: // N clear + UsedFlags.N = true; + break; + + case AArch64CC::VS: // V set + case AArch64CC::VC: // V clear + UsedFlags.V = true; + break; + + case AArch64CC::GT: // Z clear, N and V the same + case AArch64CC::LE: // Z set, N and V differ + UsedFlags.Z = true; + case AArch64CC::GE: // N and V the same + case AArch64CC::LT: // N and V differ + UsedFlags.N = true; + UsedFlags.V = true; + break; } + return UsedFlags; +} + +static bool isADDSRegImm(unsigned Opcode) { + return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri; +} + +static bool isSUBSRegImm(unsigned Opcode) { + return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri; +} + +/// Check if CmpInstr can be 
substituted by MI. +/// +/// CmpInstr can be substituted: +/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0' +/// - and, MI and CmpInstr are from the same MachineBB +/// - and, condition flags are not alive in successors of the CmpInstr parent +/// - and, if MI opcode is the S form there must be no defs of flags between +/// MI and CmpInstr +/// or if MI opcode is not the S form there must be neither defs of flags +/// nor uses of flags between MI and CmpInstr. +/// - and C/V flags are not used after CmpInstr +static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr, + const TargetRegisterInfo *TRI) { + assert(MI); + assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END); + assert(CmpInstr); + + const unsigned CmpOpcode = CmpInstr->getOpcode(); + if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode)) + return false; - // If NZCV is not killed nor re-defined, we should check whether it is - // live-out. If it is live-out, do not optimize. - if (!IsSafe) { - MachineBasicBlock *ParentBlock = CmpInstr->getParent(); - for (auto *MBB : ParentBlock->successors()) - if (MBB->isLiveIn(AArch64::NZCV)) + if (MI->getParent() != CmpInstr->getParent()) + return false; + + if (areCFlagsAliveInSuccessors(CmpInstr->getParent())) + return false; + + AccessKind AccessToCheck = AK_Write; + if (sForm(*MI) != MI->getOpcode()) + AccessToCheck = AK_All; + if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck)) + return false; + + UsedNZCV NZCVUsedAfterCmp; + for (auto I = std::next(CmpInstr->getIterator()), E = CmpInstr->getParent()->instr_end(); + I != E; ++I) { + const MachineInstr &Instr = *I; + if (Instr.readsRegister(AArch64::NZCV, TRI)) { + AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr); + if (CC == AArch64CC::Invalid) // Unsupported conditional instruction return false; + NZCVUsedAfterCmp |= getUsedNZCV(CC); + } + + if (Instr.modifiesRegister(AArch64::NZCV, TRI)) + break; } + + return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V; +} + +/// Substitute an instruction comparing to zero with another instruction +/// which produces needed condition flags. +/// +/// Return true on success. +bool AArch64InstrInfo::substituteCmpToZero( + MachineInstr &CmpInstr, unsigned SrcReg, + const MachineRegisterInfo *MRI) const { + assert(MRI); + // Get the unique definition of SrcReg. + MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); + if (!MI) + return false; + + const TargetRegisterInfo *TRI = &getRegisterInfo(); + + unsigned NewOpc = sForm(*MI); + if (NewOpc == AArch64::INSTRUCTION_LIST_END) + return false; + + if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI)) + return false; // Update the instruction to set NZCV. 
MI->setDesc(get(NewOpc)); - CmpInstr->eraseFromParent(); - bool succeeded = UpdateOperandRegClass(MI); + CmpInstr.eraseFromParent(); + bool succeeded = UpdateOperandRegClass(*MI); (void)succeeded; assert(succeeded && "Some operands reg class are incompatible!"); MI->addRegisterDefined(AArch64::NZCV, TRI); return true; } -bool -AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { - if (MI->getOpcode() != TargetOpcode::LOAD_STACK_GUARD) +bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { + if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD) return false; - MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - unsigned Reg = MI->getOperand(0).getReg(); + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Reg = MI.getOperand(0).getReg(); const GlobalValue *GV = - cast<GlobalValue>((*MI->memoperands_begin())->getValue()); + cast<GlobalValue>((*MI.memoperands_begin())->getValue()); const TargetMachine &TM = MBB.getParent()->getTarget(); unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); const unsigned char MO_NC = AArch64II::MO_NC; @@ -1000,8 +1164,9 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) .addGlobalAddress(GV, 0, AArch64II::MO_GOT); BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) - .addReg(Reg, RegState::Kill).addImm(0) - .addMemOperand(*MI->memoperands_begin()); + .addReg(Reg, RegState::Kill) + .addImm(0) + .addMemOperand(*MI.memoperands_begin()); } else if (TM.getCodeModel() == CodeModel::Large) { BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) .addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48); @@ -1015,8 +1180,9 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { .addReg(Reg, RegState::Kill) .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0); BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) - .addReg(Reg, RegState::Kill).addImm(0) - .addMemOperand(*MI->memoperands_begin()); + .addReg(Reg, RegState::Kill) + .addImm(0) + .addMemOperand(*MI.memoperands_begin()); } else { BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg) .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); @@ -1024,7 +1190,7 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) .addReg(Reg, RegState::Kill) .addGlobalAddress(GV, 0, LoFlags) - .addMemOperand(*MI->memoperands_begin()); + .addMemOperand(*MI.memoperands_begin()); } MBB.erase(MI); @@ -1033,8 +1199,8 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { } /// Return true if this is this instruction has a non-zero immediate -bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::hasShiftedReg(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case AArch64::ADDSWrs: @@ -1069,8 +1235,8 @@ bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const { case AArch64::SUBSXrs: case AArch64::SUBWrs: case AArch64::SUBXrs: - if (MI->getOperand(3).isImm()) { - unsigned val = MI->getOperand(3).getImm(); + if (MI.getOperand(3).isImm()) { + unsigned val = MI.getOperand(3).getImm(); return (val != 0); } break; @@ -1079,8 +1245,8 @@ bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const { } /// Return true if this is this instruction has a non-zero immediate -bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const { - 
switch (MI->getOpcode()) { +bool AArch64InstrInfo::hasExtendedReg(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case AArch64::ADDSWrx: @@ -1095,8 +1261,8 @@ bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const { case AArch64::SUBWrx: case AArch64::SUBXrx: case AArch64::SUBXrx64: - if (MI->getOperand(3).isImm()) { - unsigned val = MI->getOperand(3).getImm(); + if (MI.getOperand(3).isImm()) { + unsigned val = MI.getOperand(3).getImm(); return (val != 0); } break; @@ -1107,51 +1273,51 @@ bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const { // Return true if this instruction simply sets its single destination register // to zero. This is equivalent to a register rename of the zero-register. -bool AArch64InstrInfo::isGPRZero(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case AArch64::MOVZWi: case AArch64::MOVZXi: // movz Rd, #0 (LSL #0) - if (MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) { - assert(MI->getDesc().getNumOperands() == 3 && - MI->getOperand(2).getImm() == 0 && "invalid MOVZi operands"); + if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) { + assert(MI.getDesc().getNumOperands() == 3 && + MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands"); return true; } break; case AArch64::ANDWri: // and Rd, Rzr, #imm - return MI->getOperand(1).getReg() == AArch64::WZR; + return MI.getOperand(1).getReg() == AArch64::WZR; case AArch64::ANDXri: - return MI->getOperand(1).getReg() == AArch64::XZR; + return MI.getOperand(1).getReg() == AArch64::XZR; case TargetOpcode::COPY: - return MI->getOperand(1).getReg() == AArch64::WZR; + return MI.getOperand(1).getReg() == AArch64::WZR; } return false; } // Return true if this instruction simply renames a general register without // modifying bits. -bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case TargetOpcode::COPY: { // GPR32 copies will by lowered to ORRXrs - unsigned DstReg = MI->getOperand(0).getReg(); + unsigned DstReg = MI.getOperand(0).getReg(); return (AArch64::GPR32RegClass.contains(DstReg) || AArch64::GPR64RegClass.contains(DstReg)); } case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0) - if (MI->getOperand(1).getReg() == AArch64::XZR) { - assert(MI->getDesc().getNumOperands() == 4 && - MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands"); + if (MI.getOperand(1).getReg() == AArch64::XZR) { + assert(MI.getDesc().getNumOperands() == 4 && + MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands"); return true; } break; case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0) - if (MI->getOperand(2).getImm() == 0) { - assert(MI->getDesc().getNumOperands() == 4 && - MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands"); + if (MI.getOperand(2).getImm() == 0) { + assert(MI.getDesc().getNumOperands() == 4 && + MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands"); return true; } break; @@ -1161,19 +1327,19 @@ bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const { // Return true if this instruction simply renames a general register without // modifying bits. 
-bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case TargetOpcode::COPY: { // FPR64 copies will by lowered to ORR.16b - unsigned DstReg = MI->getOperand(0).getReg(); + unsigned DstReg = MI.getOperand(0).getReg(); return (AArch64::FPR64RegClass.contains(DstReg) || AArch64::FPR128RegClass.contains(DstReg)); } case AArch64::ORRv16i8: - if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) { - assert(MI->getDesc().getNumOperands() == 3 && MI->getOperand(0).isReg() && + if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { + assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() && "invalid ORRv16i8 operands"); return true; } @@ -1182,9 +1348,9 @@ bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const { return false; } -unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, +unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const { - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: break; case AArch64::LDRWui: @@ -1194,10 +1360,10 @@ unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, case AArch64::LDRSui: case AArch64::LDRDui: case AArch64::LDRQui: - if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); + if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && + MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); } break; } @@ -1205,9 +1371,9 @@ unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, return 0; } -unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI, +unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const { - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: break; case AArch64::STRWui: @@ -1217,10 +1383,10 @@ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI, case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: - if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); + if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && + MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); } break; } @@ -1230,8 +1396,8 @@ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI, /// Return true if this is load/store scales or extends its register offset. /// This refers to scaling a dynamic index as opposed to scaled immediates. /// MI should be a memory op that allows scaled addressing. 
-bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::isScaledAddr(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case AArch64::LDRBBroW: @@ -1281,7 +1447,7 @@ bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const { case AArch64::STRWroX: case AArch64::STRXroX: - unsigned Val = MI->getOperand(3).getImm(); + unsigned Val = MI.getOperand(3).getImm(); AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getMemExtendType(Val); return (ExtType != AArch64_AM::UXTX) || AArch64_AM::getMemDoShift(Val); } @@ -1289,36 +1455,96 @@ bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const { } /// Check all MachineMemOperands for a hint to suppress pairing. -bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr *MI) const { - assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && - "Too many target MO flags"); - for (auto *MM : MI->memoperands()) { - if (MM->getFlags() & - (MOSuppressPair << MachineMemOperand::MOTargetStartBit)) { - return true; - } - } - return false; +bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) const { + return any_of(MI.memoperands(), [](MachineMemOperand *MMO) { + return MMO->getFlags() & MOSuppressPair; + }); } /// Set a flag on the first MachineMemOperand to suppress pairing. -void AArch64InstrInfo::suppressLdStPair(MachineInstr *MI) const { - if (MI->memoperands_empty()) +void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) const { + if (MI.memoperands_empty()) return; + (*MI.memoperands_begin())->setFlags(MOSuppressPair); +} - assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && - "Too many target MO flags"); - (*MI->memoperands_begin()) - ->setFlags(MOSuppressPair << MachineMemOperand::MOTargetStartBit); +bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const { + switch (Opc) { + default: + return false; + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURBBi: + case AArch64::STURHHi: + case AArch64::STURWi: + case AArch64::STURXi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURWi: + case AArch64::LDURXi: + case AArch64::LDURSWi: + case AArch64::LDURHHi: + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + case AArch64::LDURSHWi: + return true; + } } -bool -AArch64InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, - unsigned &Offset, - const TargetRegisterInfo *TRI) const { - switch (LdSt->getOpcode()) { +bool AArch64InstrInfo::isUnscaledLdSt(MachineInstr &MI) const { + return isUnscaledLdSt(MI.getOpcode()); +} + +// Is this a candidate for ld/st merging or pairing? For example, we don't +// touch volatiles or load/stores that have a hint to avoid pair formation. +bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const { + // If this is a volatile load/store, don't mess with it. + if (MI.hasOrderedMemoryRef()) + return false; + + // Make sure this is a reg+imm (as opposed to an address reloc). + assert(MI.getOperand(1).isReg() && "Expected a reg operand."); + if (!MI.getOperand(2).isImm()) + return false; + + // Can't merge/pair if the instruction modifies the base register. + // e.g., ldr x0, [x0] + unsigned BaseReg = MI.getOperand(1).getReg(); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + if (MI.modifiesRegister(BaseReg, TRI)) + return false; + + // Check if this load/store has a hint to avoid pair formation. 
+ // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. + if (isLdStPairSuppressed(MI)) + return false; + + // On some CPUs quad load/store pairs are slower than two single load/stores. + if (Subtarget.avoidQuadLdStPairs()) { + switch (MI.getOpcode()) { + default: + break; + + case AArch64::LDURQi: + case AArch64::STURQi: + case AArch64::LDRQui: + case AArch64::STRQui: + return false; + } + } + + return true; +} + +bool AArch64InstrInfo::getMemOpBaseRegImmOfs( + MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, + const TargetRegisterInfo *TRI) const { + switch (LdSt.getOpcode()) { default: return false; + // Scaled instructions. case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: @@ -1329,29 +1555,45 @@ AArch64InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, case AArch64::LDRQui: case AArch64::LDRXui: case AArch64::LDRWui: - if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) - return false; - BaseReg = LdSt->getOperand(1).getReg(); - MachineFunction &MF = *LdSt->getParent()->getParent(); - unsigned Width = getRegClass(LdSt->getDesc(), 0, TRI, MF)->getSize(); - Offset = LdSt->getOperand(2).getImm() * Width; - return true; + case AArch64::LDRSWui: + // Unscaled instructions. + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURXi: + case AArch64::STURWi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURWi: + case AArch64::LDURXi: + case AArch64::LDURSWi: + unsigned Width; + return getMemOpBaseRegImmOfsWidth(LdSt, BaseReg, Offset, Width, TRI); }; } bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( - MachineInstr *LdSt, unsigned &BaseReg, int &Offset, int &Width, + MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, unsigned &Width, const TargetRegisterInfo *TRI) const { + assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); // Handle only loads/stores with base register followed by immediate offset. - if (LdSt->getNumOperands() != 3) - return false; - if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) + if (LdSt.getNumExplicitOperands() == 3) { + // Non-paired instruction (e.g., ldr x1, [x0, #8]). + if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isImm()) + return false; + } else if (LdSt.getNumExplicitOperands() == 4) { + // Paired instruction (e.g., ldp x1, x2, [x0, #8]). + if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isReg() || + !LdSt.getOperand(3).isImm()) + return false; + } else return false; // Offset is calculated as the immediate operand multiplied by the scaling factor. // Unscaled instructions have scaling factor set to 1. 
- int Scale = 0; - switch (LdSt->getOpcode()) { + unsigned Scale = 0; + switch (LdSt.getOpcode()) { default: return false; case AArch64::LDURQi: @@ -1392,18 +1634,48 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( Width = 1; Scale = 1; break; + case AArch64::LDPQi: + case AArch64::LDNPQi: + case AArch64::STPQi: + case AArch64::STNPQi: + Scale = 16; + Width = 32; + break; case AArch64::LDRQui: case AArch64::STRQui: Scale = Width = 16; break; + case AArch64::LDPXi: + case AArch64::LDPDi: + case AArch64::LDNPXi: + case AArch64::LDNPDi: + case AArch64::STPXi: + case AArch64::STPDi: + case AArch64::STNPXi: + case AArch64::STNPDi: + Scale = 8; + Width = 16; + break; case AArch64::LDRXui: case AArch64::LDRDui: case AArch64::STRXui: case AArch64::STRDui: Scale = Width = 8; break; + case AArch64::LDPWi: + case AArch64::LDPSi: + case AArch64::LDNPWi: + case AArch64::LDNPSi: + case AArch64::STPWi: + case AArch64::STPSi: + case AArch64::STNPWi: + case AArch64::STNPSi: + Scale = 4; + Width = 8; + break; case AArch64::LDRWui: case AArch64::LDRSui: + case AArch64::LDRSWui: case AArch64::STRWui: case AArch64::STRSui: Scale = Width = 4; @@ -1420,41 +1692,120 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( case AArch64::STRBBui: Scale = Width = 1; break; - }; + } - BaseReg = LdSt->getOperand(1).getReg(); - Offset = LdSt->getOperand(2).getImm() * Scale; + if (LdSt.getNumExplicitOperands() == 3) { + BaseReg = LdSt.getOperand(1).getReg(); + Offset = LdSt.getOperand(2).getImm() * Scale; + } else { + assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); + BaseReg = LdSt.getOperand(2).getReg(); + Offset = LdSt.getOperand(3).getImm() * Scale; + } return true; } +// Scale the unscaled offsets. Returns false if the unscaled offset can't be +// scaled. +static bool scaleOffset(unsigned Opc, int64_t &Offset) { + unsigned OffsetStride = 1; + switch (Opc) { + default: + return false; + case AArch64::LDURQi: + case AArch64::STURQi: + OffsetStride = 16; + break; + case AArch64::LDURXi: + case AArch64::LDURDi: + case AArch64::STURXi: + case AArch64::STURDi: + OffsetStride = 8; + break; + case AArch64::LDURWi: + case AArch64::LDURSi: + case AArch64::LDURSWi: + case AArch64::STURWi: + case AArch64::STURSi: + OffsetStride = 4; + break; + } + // If the byte-offset isn't a multiple of the stride, we can't scale this + // offset. + if (Offset % OffsetStride != 0) + return false; + + // Convert the byte-offset used by unscaled into an "element" offset used + // by the scaled pair load/store instructions. + Offset /= OffsetStride; + return true; +} + +static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) { + if (FirstOpc == SecondOpc) + return true; + // We can also pair sign-ext and zero-ext instructions. + switch (FirstOpc) { + default: + return false; + case AArch64::LDRWui: + case AArch64::LDURWi: + return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi; + case AArch64::LDRSWui: + case AArch64::LDURSWi: + return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi; + } + // These instructions can't be paired based on their opcodes. + return false; +} + /// Detect opportunities for ldp/stp formation. /// /// Only called for LdSt for which getMemOpBaseRegImmOfs returns true. -bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, - MachineInstr *SecondLdSt, - unsigned NumLoads) const { +bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, + MachineInstr &SecondLdSt, + unsigned NumLoads) const { // Only cluster up to a single pair. 
if (NumLoads > 1) return false; - if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode()) + + // Can we pair these instructions based on their opcodes? + unsigned FirstOpc = FirstLdSt.getOpcode(); + unsigned SecondOpc = SecondLdSt.getOpcode(); + if (!canPairLdStOpc(FirstOpc, SecondOpc)) + return false; + + // Can't merge volatiles or load/stores that have a hint to avoid pair + // formation, for example. + if (!isCandidateToMergeOrPair(FirstLdSt) || + !isCandidateToMergeOrPair(SecondLdSt)) + return false; + + // isCandidateToMergeOrPair guarantees that operand 2 is an immediate. + int64_t Offset1 = FirstLdSt.getOperand(2).getImm(); + if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1)) + return false; + + int64_t Offset2 = SecondLdSt.getOperand(2).getImm(); + if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2)) return false; - // getMemOpBaseRegImmOfs guarantees that oper 2 isImm. - unsigned Ofs1 = FirstLdSt->getOperand(2).getImm(); - // Allow 6 bits of positive range. - if (Ofs1 > 64) + + // Pairwise instructions have a 7-bit signed offset field. + if (Offset1 > 63 || Offset1 < -64) return false; + // The caller should already have ordered First/SecondLdSt by offset. - unsigned Ofs2 = SecondLdSt->getOperand(2).getImm(); - return Ofs1 + 1 == Ofs2; + assert(Offset1 <= Offset2 && "Caller should have ordered offsets."); + return Offset1 + 1 == Offset2; } -bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, - MachineInstr *Second) const { - if (Subtarget.isCyclone()) { - // Cyclone can fuse CMN, CMP, TST followed by Bcc. - unsigned SecondOpcode = Second->getOpcode(); +bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr &First, + MachineInstr &Second) const { + if (Subtarget.hasMacroOpFusion()) { + // Fuse CMN, CMP, TST followed by Bcc. + unsigned SecondOpcode = Second.getOpcode(); if (SecondOpcode == AArch64::Bcc) { - switch (First->getOpcode()) { + switch (First.getOpcode()) { default: return false; case AArch64::SUBSWri: @@ -1466,10 +1817,10 @@ bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, return true; } } - // Cyclone B0 also supports ALU operations followed by CBZ/CBNZ. + // Fuse ALU operations followed by CBZ/CBNZ. 
if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX || SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) { - switch (First->getOpcode()) { + switch (First.getOpcode()) { default: return false; case AArch64::ADDWri: @@ -1491,7 +1842,7 @@ bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue( MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var, - const MDNode *Expr, DebugLoc DL) const { + const MDNode *Expr, const DebugLoc &DL) const { MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE)) .addFrameIndex(FrameIx) .addImm(0) @@ -1521,7 +1872,7 @@ static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, } void AArch64InstrInfo::copyPhysRegTuple( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, llvm::ArrayRef<unsigned> Indices) const { assert(Subtarget.hasNEON() && @@ -1547,9 +1898,9 @@ void AArch64InstrInfo::copyPhysRegTuple( } void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { + MachineBasicBlock::iterator I, + const DebugLoc &DL, unsigned DestReg, + unsigned SrcReg, bool KillSrc) const { if (AArch64::GPR32spRegClass.contains(DestReg) && (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { const TargetRegisterInfo *TRI = &getRegisterInfo(); @@ -1818,8 +2169,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (SrcReg == AArch64::NZCV) { assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); - BuildMI(MBB, I, DL, get(AArch64::MRS)) - .addReg(DestReg) + BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg) .addImm(AArch64SysReg::NZCV) .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); return; @@ -1879,39 +2229,45 @@ void AArch64InstrInfo::storeRegToStackSlot( else if (AArch64::DDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Twov1d, Offset = false; + Opc = AArch64::ST1Twov1d; + Offset = false; } break; case 24: if (AArch64::DDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Threev1d, Offset = false; + Opc = AArch64::ST1Threev1d; + Offset = false; } break; case 32: if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Fourv1d, Offset = false; + Opc = AArch64::ST1Fourv1d; + Offset = false; } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Twov2d, Offset = false; + Opc = AArch64::ST1Twov2d; + Offset = false; } break; case 48: if (AArch64::QQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Threev2d, Offset = false; + Opc = AArch64::ST1Threev2d; + Offset = false; } break; case 64: if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Fourv2d, Offset = false; + Opc = AArch64::ST1Fourv2d; + Offset = false; } break; } @@ -1977,39 +2333,45 @@ void AArch64InstrInfo::loadRegFromStackSlot( else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 
assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Twov1d, Offset = false; + Opc = AArch64::LD1Twov1d; + Offset = false; } break; case 24: if (AArch64::DDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Threev1d, Offset = false; + Opc = AArch64::LD1Threev1d; + Offset = false; } break; case 32: if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Fourv1d, Offset = false; + Opc = AArch64::LD1Fourv1d; + Offset = false; } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Twov2d, Offset = false; + Opc = AArch64::LD1Twov2d; + Offset = false; } break; case 48: if (AArch64::QQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Threev2d, Offset = false; + Opc = AArch64::LD1Threev2d; + Offset = false; } break; case 64: if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Fourv2d, Offset = false; + Opc = AArch64::LD1Fourv2d; + Offset = false; } break; } @@ -2024,13 +2386,16 @@ void AArch64InstrInfo::loadRegFromStackSlot( } void llvm::emitFrameOffset(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc DL, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, int Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool SetNZCV) { if (DestReg == SrcReg && Offset == 0) return; + assert((DestReg != AArch64::SP || Offset % 16 == 0) && + "SP increment/decrement not 16-byte aligned"); + bool isSub = Offset < 0; if (isSub) Offset = -Offset; @@ -2082,8 +2447,9 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, } MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( - MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, int FrameIndex) const { + MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, int FrameIndex, + LiveIntervals *LIS) const { // This is a bit of a hack. Consider this instruction: // // %vreg0<def> = COPY %SP; GPR64all:%vreg0 @@ -2097,9 +2463,9 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // // <rdar://problem/11522048> // - if (MI->isCopy()) { - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned SrcReg = MI->getOperand(1).getReg(); + if (MI.isCopy()) { + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned SrcReg = MI.getOperand(1).getReg(); if (SrcReg == AArch64::SP && TargetRegisterInfo::isVirtualRegister(DstReg)) { MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); @@ -2393,9 +2759,10 @@ void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { NopInst.setOpcode(AArch64::HINT); NopInst.addOperand(MCOperand::createImm(0)); } -/// useMachineCombiner - return true when a target supports MachineCombiner + +// AArch64 supports MachineCombiner. 
bool AArch64InstrInfo::useMachineCombiner() const { - // AArch64 supports the combiner + return true; } // @@ -2456,37 +2823,75 @@ static bool isCombineInstrCandidate64(unsigned Opc) { return false; } // +// FP opcodes that can be combined with an FMUL. +static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { + switch (Inst.getOpcode()) { + case AArch64::FADDSrr: + case AArch64::FADDDrr: + case AArch64::FADDv2f32: + case AArch64::FADDv2f64: + case AArch64::FADDv4f32: + case AArch64::FSUBSrr: + case AArch64::FSUBDrr: + case AArch64::FSUBv2f32: + case AArch64::FSUBv2f64: + case AArch64::FSUBv4f32: + return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; + default: + break; + } + return false; +} +// // Opcodes that can be combined with a MUL static bool isCombineInstrCandidate(unsigned Opc) { return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc)); } -static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, - unsigned MulOpc, unsigned ZeroReg) { +// +// Utility routine that checks if \param MO is defined by an +// \param CombineOpc instruction in the basic block \param MBB. +static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, + unsigned CombineOpc, unsigned ZeroReg = 0, + bool CheckZeroReg = false) { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineInstr *MI = nullptr; - // We need a virtual register definition. + if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) MI = MRI.getUniqueVRegDef(MO.getReg()); // And it needs to be in the trace (otherwise, it won't have a depth). - if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != MulOpc) - return false; - - assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && - MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && - MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs"); - - // The third input reg must be zero. - if (MI->getOperand(3).getReg() != ZeroReg) + if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc) return false; - // Must only used by the user we combine with. if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) return false; + if (CheckZeroReg) { + assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && + MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && + MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs"); + // The third input reg must be zero. + if (MI->getOperand(3).getReg() != ZeroReg) + return false; + } + return true; } +// +// Is \param MO defined by an integer multiply and can be combined? +static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, + unsigned MulOpc, unsigned ZeroReg) { + return canCombine(MBB, MO, MulOpc, ZeroReg, true); +} + +// +// Is \param MO defined by a floating-point multiply and can be combined? +static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, + unsigned MulOpc) { + return canCombine(MBB, MO, MulOpc); +} + // TODO: There are many more machine instruction opcodes to match: // 1. Other data types (integer, vectors) // 2. Other math / logic operations (xor, or) @@ -2522,17 +2927,17 @@ static bool getMaddPatterns(MachineInstr &Root, bool Found = false; if (!isCombineInstrCandidate(Opc)) - return 0; + return false; if (isCombineInstrSettingFlag(Opc)) { int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true); // When NZCV is live bail out. 
if (Cmp_NZCV == -1) - return 0; - unsigned NewOpc = convertFlagSettingOpcode(&Root); + return false; + unsigned NewOpc = convertFlagSettingOpcode(Root); // When opcode can't change bail out. // CHECKME: do we miss any cases for opcode conversion? if (NewOpc == Opc) - return 0; + return false; Opc = NewOpc; } @@ -2620,7 +3025,230 @@ static bool getMaddPatterns(MachineInstr &Root, } return Found; } +/// Floating-Point Support + +/// Find instructions that can be turned into madd. +static bool getFMAPatterns(MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns) { + + if (!isCombineInstrCandidateFP(Root)) + return 0; + MachineBasicBlock &MBB = *Root.getParent(); + bool Found = false; + + switch (Root.getOpcode()) { + default: + assert(false && "Unsupported FP instruction in combiner\n"); + break; + case AArch64::FADDSrr: + assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && + "FADDWrr does not have register operands"); + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv1i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2); + Found = true; + } + break; + case AArch64::FADDDrr: + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv1i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2); + Found = true; + } + break; + case AArch64::FADDv2f32: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2); + Found = true; + } + break; + case AArch64::FADDv2f64: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2f64)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2); + Found = true; + } else if 
(canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2f64)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2); + Found = true; + } + break; + case AArch64::FADDv4f32: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv4i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv4f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2); + Found = true; + } + break; + + case AArch64::FSUBSrr: + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2); + Found = true; + } + break; + case AArch64::FSUBDrr: + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2); + Found = true; + } + break; + case AArch64::FSUBv2f32: + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2f32)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2); + Found = true; + } + break; + case AArch64::FSUBv2f64: + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2f64)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2); + Found = true; + } + break; + case AArch64::FSUBv4f32: + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4f32)) { + Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2); + Found = true; + } + break; + } + return Found; +} + +/// Return true when a code sequence can improve throughput. It +/// should be called only for instructions in loops. 
+/// \param Pattern - combiner pattern +bool +AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const { + switch (Pattern) { + default: + break; + case MachineCombinerPattern::FMULADDS_OP1: + case MachineCombinerPattern::FMULADDS_OP2: + case MachineCombinerPattern::FMULSUBS_OP1: + case MachineCombinerPattern::FMULSUBS_OP2: + case MachineCombinerPattern::FMULADDD_OP1: + case MachineCombinerPattern::FMULADDD_OP2: + case MachineCombinerPattern::FMULSUBD_OP1: + case MachineCombinerPattern::FMULSUBD_OP2: + case MachineCombinerPattern::FMLAv1i32_indexed_OP1: + case MachineCombinerPattern::FMLAv1i32_indexed_OP2: + case MachineCombinerPattern::FMLAv1i64_indexed_OP1: + case MachineCombinerPattern::FMLAv1i64_indexed_OP2: + case MachineCombinerPattern::FMLAv2f32_OP2: + case MachineCombinerPattern::FMLAv2f32_OP1: + case MachineCombinerPattern::FMLAv2f64_OP1: + case MachineCombinerPattern::FMLAv2f64_OP2: + case MachineCombinerPattern::FMLAv2i32_indexed_OP1: + case MachineCombinerPattern::FMLAv2i32_indexed_OP2: + case MachineCombinerPattern::FMLAv2i64_indexed_OP1: + case MachineCombinerPattern::FMLAv2i64_indexed_OP2: + case MachineCombinerPattern::FMLAv4f32_OP1: + case MachineCombinerPattern::FMLAv4f32_OP2: + case MachineCombinerPattern::FMLAv4i32_indexed_OP1: + case MachineCombinerPattern::FMLAv4i32_indexed_OP2: + case MachineCombinerPattern::FMLSv1i32_indexed_OP2: + case MachineCombinerPattern::FMLSv1i64_indexed_OP2: + case MachineCombinerPattern::FMLSv2i32_indexed_OP2: + case MachineCombinerPattern::FMLSv2i64_indexed_OP2: + case MachineCombinerPattern::FMLSv2f32_OP2: + case MachineCombinerPattern::FMLSv2f64_OP2: + case MachineCombinerPattern::FMLSv4i32_indexed_OP2: + case MachineCombinerPattern::FMLSv4f32_OP2: + return true; + } // end switch (Pattern) + return false; +} /// Return true when there is potentially a faster code sequence for an /// instruction chain ending in \p Root. All potential patterns are listed in /// the \p Pattern vector. Pattern should be sorted in priority order since the @@ -2629,28 +3257,35 @@ static bool getMaddPatterns(MachineInstr &Root, bool AArch64InstrInfo::getMachineCombinerPatterns( MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns) const { + // Integer patterns if (getMaddPatterns(Root, Patterns)) return true; + // Floating point patterns + if (getFMAPatterns(Root, Patterns)) + return true; return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns); } -/// genMadd - Generate madd instruction and combine mul and add. -/// Example: -/// MUL I=A,B,0 -/// ADD R,I,C -/// ==> MADD R,A,B,C -/// \param Root is the ADD instruction +enum class FMAInstKind { Default, Indexed, Accumulator }; +/// genFusedMultiply - Generate fused multiply instructions. +/// This function supports both integer and floating point instructions. +/// A typical example: +/// F|MUL I=A,B,0 +/// F|ADD R,I,C +/// ==> F|MADD R,A,B,C +/// \param Root is the F|ADD instruction /// \param [out] InsInstrs is a vector of machine instructions and will /// contain the generated madd instruction /// \param IdxMulOpd is index of operand in Root that is the result of -/// the MUL. In the example above IdxMulOpd is 1. -/// \param MaddOpc the opcode fo the madd instruction -static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI, - const TargetInstrInfo *TII, MachineInstr &Root, - SmallVectorImpl<MachineInstr *> &InsInstrs, - unsigned IdxMulOpd, unsigned MaddOpc, - const TargetRegisterClass *RC) { +/// the F|MUL. In the example above IdxMulOpd is 1. 
+/// \param MaddOpc the opcode fo the f|madd instruction +static MachineInstr * +genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, + const TargetInstrInfo *TII, MachineInstr &Root, + SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd, + unsigned MaddOpc, const TargetRegisterClass *RC, + FMAInstKind kind = FMAInstKind::Default) { assert(IdxMulOpd == 1 || IdxMulOpd == 2); unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1; @@ -2672,12 +3307,26 @@ static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI, if (TargetRegisterInfo::isVirtualRegister(SrcReg2)) MRI.constrainRegClass(SrcReg2, RC); - MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), - ResultReg) - .addReg(SrcReg0, getKillRegState(Src0IsKill)) - .addReg(SrcReg1, getKillRegState(Src1IsKill)) - .addReg(SrcReg2, getKillRegState(Src2IsKill)); - // Insert the MADD + MachineInstrBuilder MIB; + if (kind == FMAInstKind::Default) + MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)) + .addReg(SrcReg2, getKillRegState(Src2IsKill)); + else if (kind == FMAInstKind::Indexed) + MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) + .addReg(SrcReg2, getKillRegState(Src2IsKill)) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)) + .addImm(MUL->getOperand(3).getImm()); + else if (kind == FMAInstKind::Accumulator) + MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) + .addReg(SrcReg2, getKillRegState(Src2IsKill)) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)); + else + assert(false && "Invalid FMA instruction kind \n"); + // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL) InsInstrs.push_back(MIB); return MUL; } @@ -2765,7 +3414,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } - MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; case MachineCombinerPattern::MULADDW_OP2: case MachineCombinerPattern::MULADDX_OP2: @@ -2780,7 +3429,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } - MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; case MachineCombinerPattern::MULADDWI_OP1: case MachineCombinerPattern::MULADDXI_OP1: { @@ -2872,7 +3521,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MSUBXrrr; RC = &AArch64::GPR64RegClass; } - MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; case MachineCombinerPattern::MULSUBWI_OP1: case MachineCombinerPattern::MULSUBXI_OP1: { @@ -2917,6 +3566,234 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } break; } + // Floating Point Support + case MachineCombinerPattern::FMULADDS_OP1: + case MachineCombinerPattern::FMULADDD_OP1: + // MUL I=A,B,0 + // ADD R,I,C + // ==> MADD R,A,B,C + // --- Create(MADD); + if (Pattern == MachineCombinerPattern::FMULADDS_OP1) { + Opc = AArch64::FMADDSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FMADDDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::FMULADDS_OP2: + case 
MachineCombinerPattern::FMULADDD_OP2: + // FMUL I=A,B,0 + // FADD R,C,I + // ==> FMADD R,A,B,C + // --- Create(FMADD); + if (Pattern == MachineCombinerPattern::FMULADDS_OP2) { + Opc = AArch64::FMADDSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FMADDDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + + case MachineCombinerPattern::FMLAv1i32_indexed_OP1: + Opc = AArch64::FMLAv1i32_indexed; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLAv1i32_indexed_OP2: + Opc = AArch64::FMLAv1i32_indexed; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLAv1i64_indexed_OP1: + Opc = AArch64::FMLAv1i64_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLAv1i64_indexed_OP2: + Opc = AArch64::FMLAv1i64_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLAv2i32_indexed_OP1: + case MachineCombinerPattern::FMLAv2f32_OP1: + RC = &AArch64::FPR64RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) { + Opc = AArch64::FMLAv2i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator); + } + break; + case MachineCombinerPattern::FMLAv2i32_indexed_OP2: + case MachineCombinerPattern::FMLAv2f32_OP2: + RC = &AArch64::FPR64RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) { + Opc = AArch64::FMLAv2i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLAv2i64_indexed_OP1: + case MachineCombinerPattern::FMLAv2f64_OP1: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) { + Opc = AArch64::FMLAv2i64_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f64; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator); + } + break; + case MachineCombinerPattern::FMLAv2i64_indexed_OP2: + case MachineCombinerPattern::FMLAv2f64_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) { + Opc = AArch64::FMLAv2i64_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f64; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLAv4i32_indexed_OP1: + case MachineCombinerPattern::FMLAv4f32_OP1: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) { + Opc = AArch64::FMLAv4i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = 
AArch64::FMLAv4f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLAv4i32_indexed_OP2: + case MachineCombinerPattern::FMLAv4f32_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) { + Opc = AArch64::FMLAv4i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv4f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMULSUBS_OP1: + case MachineCombinerPattern::FMULSUBD_OP1: { + // FMUL I=A,B,0 + // FSUB R,I,C + // ==> FNMSUB R,A,B,C // = -C + A*B + // --- Create(FNMSUB); + if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) { + Opc = AArch64::FNMSUBSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FNMSUBDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + } + case MachineCombinerPattern::FMULSUBS_OP2: + case MachineCombinerPattern::FMULSUBD_OP2: { + // FMUL I=A,B,0 + // FSUB R,C,I + // ==> FMSUB R,A,B,C (computes C - A*B) + // --- Create(FMSUB); + if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) { + Opc = AArch64::FMSUBSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FMSUBDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + + case MachineCombinerPattern::FMLSv1i32_indexed_OP2: + Opc = AArch64::FMLSv1i32_indexed; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLSv1i64_indexed_OP2: + Opc = AArch64::FMLSv1i64_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLSv2f32_OP2: + case MachineCombinerPattern::FMLSv2i32_indexed_OP2: + RC = &AArch64::FPR64RegClass; + if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) { + Opc = AArch64::FMLSv2i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLSv2f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLSv2f64_OP2: + case MachineCombinerPattern::FMLSv2i64_indexed_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) { + Opc = AArch64::FMLSv2i64_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLSv2f64; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLSv4f32_OP2: + case MachineCombinerPattern::FMLSv4i32_indexed_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) { + Opc = AArch64::FMLSv4i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLSv4f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion DelInstrs.push_back(MUL); @@ -2940,14 +3817,23 @@ void 
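The two FMULSUB groups differ in which FSUB operand is the multiply result: when the product is subtracted from the other value (OP2) the combiner can use FMSUB, which computes C - A*B, and when the product comes first (OP1) it must use FNMSUB, which computes -C + A*B, exactly as the comments state. A small scalar check of those sign conventions (plain double arithmetic; the single-rounding behaviour of the real fused instructions is ignored):

    #include <cassert>

    // Scalar models of the two fused forms chosen above.
    double fmsub(double A, double B, double C) { return C - A * B; }   // FMSUB
    double fnmsub(double A, double B, double C) { return -C + A * B; } // FNMSUB

    int main() {
      double A = 3.0, B = 4.0, C = 5.0;
      // FMUL I = A*B; FSUB R = I - C  ==>  FNMSUB R, A, B, C
      assert(fnmsub(A, B, C) == (A * B) - C);
      // FMUL I = A*B; FSUB R = C - I  ==>  FMSUB R, A, B, C
      assert(fmsub(A, B, C) == C - (A * B));
      return 0;
    }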
AArch64InstrInfo::genAlternativeCodeSequence( /// to /// b.<condition code> /// +/// Replace compare and branch sequence by TBZ/TBNZ instruction when the +/// compare's constant operand is power of 2. +/// +/// Examples: +/// and w8, w8, #0x400 +/// cbnz w8, L1 +/// to +/// tbnz w8, #10, L1 +/// /// \param MI Conditional Branch /// \return True when the simple conditional branch is generated /// -bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const { +bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const { bool IsNegativeBranch = false; bool IsTestAndBranch = false; unsigned TargetBBInMI = 0; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: llvm_unreachable("Unknown branch instruction?"); case AArch64::Bcc: @@ -2976,48 +3862,108 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const { // So we increment a zero register and test for bits other // than bit 0? Conservatively bail out in case the verifier // missed this case. - if (IsTestAndBranch && MI->getOperand(1).getImm()) + if (IsTestAndBranch && MI.getOperand(1).getImm()) return false; // Find Definition. - assert(MI->getParent() && "Incomplete machine instruciton\n"); - MachineBasicBlock *MBB = MI->getParent(); + assert(MI.getParent() && "Incomplete machine instruciton\n"); + MachineBasicBlock *MBB = MI.getParent(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); - unsigned VReg = MI->getOperand(0).getReg(); + unsigned VReg = MI.getOperand(0).getReg(); if (!TargetRegisterInfo::isVirtualRegister(VReg)) return false; MachineInstr *DefMI = MRI->getVRegDef(VReg); - // Look for CSINC - if (!(DefMI->getOpcode() == AArch64::CSINCWr && - DefMI->getOperand(1).getReg() == AArch64::WZR && - DefMI->getOperand(2).getReg() == AArch64::WZR) && - !(DefMI->getOpcode() == AArch64::CSINCXr && - DefMI->getOperand(1).getReg() == AArch64::XZR && - DefMI->getOperand(2).getReg() == AArch64::XZR)) - return false; + // Look through COPY instructions to find definition. + while (DefMI->isCopy()) { + unsigned CopyVReg = DefMI->getOperand(1).getReg(); + if (!MRI->hasOneNonDBGUse(CopyVReg)) + return false; + if (!MRI->hasOneDef(CopyVReg)) + return false; + DefMI = MRI->getVRegDef(CopyVReg); + } - if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) + switch (DefMI->getOpcode()) { + default: return false; + // Fold AND into a TBZ/TBNZ if constant operand is power of 2. + case AArch64::ANDWri: + case AArch64::ANDXri: { + if (IsTestAndBranch) + return false; + if (DefMI->getParent() != MBB) + return false; + if (!MRI->hasOneNonDBGUse(VReg)) + return false; - AArch64CC::CondCode CC = - (AArch64CC::CondCode)DefMI->getOperand(3).getImm(); - bool CheckOnlyCCWrites = true; - // Convert only when the condition code is not modified between - // the CSINC and the branch. The CC may be used by other - // instructions in between. - if (modifiesConditionCode(DefMI, MI, CheckOnlyCCWrites, &getRegisterInfo())) - return false; - MachineBasicBlock &RefToMBB = *MBB; - MachineBasicBlock *TBB = MI->getOperand(TargetBBInMI).getMBB(); - DebugLoc DL = MI->getDebugLoc(); - if (IsNegativeBranch) - CC = AArch64CC::getInvertedCondCode(CC); - BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); - MI->eraseFromParent(); - return true; + bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri); + uint64_t Mask = AArch64_AM::decodeLogicalImmediate( + DefMI->getOperand(2).getImm(), Is32Bit ? 
32 : 64); + if (!isPowerOf2_64(Mask)) + return false; + + MachineOperand &MO = DefMI->getOperand(1); + unsigned NewReg = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(NewReg)) + return false; + + assert(!MRI->def_empty(NewReg) && "Register must be defined."); + + MachineBasicBlock &RefToMBB = *MBB; + MachineBasicBlock *TBB = MI.getOperand(1).getMBB(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Imm = Log2_64(Mask); + unsigned Opc = (Imm < 32) + ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW) + : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX); + MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc)) + .addReg(NewReg) + .addImm(Imm) + .addMBB(TBB); + // Register lives on to the CBZ now. + MO.setIsKill(false); + + // For immediate smaller than 32, we need to use the 32-bit + // variant (W) in all cases. Indeed the 64-bit variant does not + // allow to encode them. + // Therefore, if the input register is 64-bit, we need to take the + // 32-bit sub-part. + if (!Is32Bit && Imm < 32) + NewMI->getOperand(0).setSubReg(AArch64::sub_32); + MI.eraseFromParent(); + return true; + } + // Look for CSINC + case AArch64::CSINCWr: + case AArch64::CSINCXr: { + if (!(DefMI->getOperand(1).getReg() == AArch64::WZR && + DefMI->getOperand(2).getReg() == AArch64::WZR) && + !(DefMI->getOperand(1).getReg() == AArch64::XZR && + DefMI->getOperand(2).getReg() == AArch64::XZR)) + return false; + + if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) + return false; + + AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm(); + // Convert only when the condition code is not modified between + // the CSINC and the branch. The CC may be used by other + // instructions in between. + if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write)) + return false; + MachineBasicBlock &RefToMBB = *MBB; + MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB(); + DebugLoc DL = MI.getDebugLoc(); + if (IsNegativeBranch) + CC = AArch64CC::getInvertedCondCode(CC); + BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); + MI.eraseFromParent(); + return true; + } + } } std::pair<unsigned, unsigned> @@ -3046,7 +3992,6 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { static const std::pair<unsigned, const char *> TargetFlags[] = { {MO_GOT, "aarch64-got"}, {MO_NC, "aarch64-nc"}, - {MO_TLS, "aarch64-tls"}, - {MO_CONSTPOOL, "aarch64-constant-pool"}}; + {MO_TLS, "aarch64-tls"}}; return makeArrayRef(TargetFlags); } diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index b5bb446f8c167..24bc0e6397477 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -28,12 +28,6 @@ class AArch64Subtarget; class AArch64TargetMachine; class AArch64InstrInfo : public AArch64GenInstrInfo { - // Reserve bits in the MachineMemOperand target hint flags, starting at 1. - // They will be shifted into MOTargetHintStart when accessed. - enum TargetMemOperandFlags { - MOSuppressPair = 1 - }; - const AArch64RegisterInfo RI; const AArch64Subtarget &Subtarget; @@ -45,76 +39,88 @@ public: /// always be able to get register info as well (through this method). 
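The two foldings in optimizeCondBranch can be summarized as follows. First, an AND with a power-of-two immediate followed by CBZ/CBNZ tests a single bit, so it becomes TBZ/TBNZ on that bit, using the W variant (and the sub_32 sub-register of an X source) when the bit index is below 32. Second, a branch on a CSINC of two zero registers is really a branch on the condition the CSINC materializes (CSINC Rd, XZR, XZR, cc yields 0 when cc holds and 1 otherwise), so it can be rewritten as a plain Bcc, inverting cc for the CBNZ/TBNZ forms. A standalone sketch of those two decisions, with made-up opcode names standing in for the real ones:

    #include <cassert>
    #include <cstdint>

    // Hypothetical stand-ins for the branch opcodes chosen above.
    enum class BranchOpc { TBZW, TBNZW, TBZX, TBNZX };

    // Pick the test-bit branch for "AND reg, #Mask; CB(N)Z reg, L".
    // Mirrors the selection above: a bit index below 32 always uses the W form.
    BranchOpc pickTestBitBranch(uint64_t Mask, bool IsNegativeBranch,
                                unsigned &BitIndex) {
      assert(Mask && (Mask & (Mask - 1)) == 0 && "mask must be a power of two");
      BitIndex = 0;
      while (!(Mask & 1)) { Mask >>= 1; ++BitIndex; } // Log2 of the mask.
      if (BitIndex < 32)
        return IsNegativeBranch ? BranchOpc::TBNZW : BranchOpc::TBZW;
      return IsNegativeBranch ? BranchOpc::TBNZX : BranchOpc::TBZX;
    }

    // Model of CSINC Rd, ZR, ZR, cc feeding a CB(N)Z: the branch is taken
    // exactly when cc holds (CBZ) or fails (CBNZ), so a Bcc on cc (inverted
    // for the negative form) is equivalent.
    bool branchTaken(bool CondHolds, bool IsNegativeBranch) {
      unsigned CsincResult = CondHolds ? 0 : 1; // cc ? ZR : ZR + 1
      return IsNegativeBranch ? (CsincResult != 0) : (CsincResult == 0);
    }

    int main() {
      unsigned Bit;
      // and w8, w8, #0x400; cbnz w8, L1  ->  tbnz w8, #10, L1
      assert(pickTestBitBranch(0x400, /*IsNegativeBranch=*/true, Bit) ==
                 BranchOpc::TBNZW && Bit == 10);
      assert(branchTaken(/*CondHolds=*/true, /*IsNegativeBranch=*/false));
      assert(!branchTaken(/*CondHolds=*/true, /*IsNegativeBranch=*/true));
    }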
const AArch64RegisterInfo &getRegisterInfo() const { return RI; } - unsigned GetInstSizeInBytes(const MachineInstr *MI) const; + unsigned GetInstSizeInBytes(const MachineInstr &MI) const; - bool isAsCheapAsAMove(const MachineInstr *MI) const override; + bool isAsCheapAsAMove(const MachineInstr &MI) const override; bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg, unsigned &SubIdx) const override; bool - areMemAccessesTriviallyDisjoint(MachineInstr *MIa, MachineInstr *MIb, + areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA = nullptr) const override; - unsigned isLoadFromStackSlot(const MachineInstr *MI, + unsigned isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; - unsigned isStoreToStackSlot(const MachineInstr *MI, + unsigned isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override; /// Returns true if there is a shiftable register and that the shift value /// is non-zero. - bool hasShiftedReg(const MachineInstr *MI) const; + bool hasShiftedReg(const MachineInstr &MI) const; /// Returns true if there is an extendable register and that the extending /// value is non-zero. - bool hasExtendedReg(const MachineInstr *MI) const; + bool hasExtendedReg(const MachineInstr &MI) const; /// \brief Does this instruction set its full destination register to zero? - bool isGPRZero(const MachineInstr *MI) const; + bool isGPRZero(const MachineInstr &MI) const; /// \brief Does this instruction rename a GPR without modifying bits? - bool isGPRCopy(const MachineInstr *MI) const; + bool isGPRCopy(const MachineInstr &MI) const; /// \brief Does this instruction rename an FPR without modifying bits? - bool isFPRCopy(const MachineInstr *MI) const; + bool isFPRCopy(const MachineInstr &MI) const; /// Return true if this is load/store scales or extends its register offset. /// This refers to scaling a dynamic index as opposed to scaled immediates. /// MI should be a memory op that allows scaled addressing. - bool isScaledAddr(const MachineInstr *MI) const; + bool isScaledAddr(const MachineInstr &MI) const; /// Return true if pairing the given load or store is hinted to be /// unprofitable. - bool isLdStPairSuppressed(const MachineInstr *MI) const; + bool isLdStPairSuppressed(const MachineInstr &MI) const; + + /// Return true if this is an unscaled load/store. + bool isUnscaledLdSt(unsigned Opc) const; + + /// Return true if this is an unscaled load/store. + bool isUnscaledLdSt(MachineInstr &MI) const; + + /// Return true if this is a load/store that can be potentially paired/merged. + bool isCandidateToMergeOrPair(MachineInstr &MI) const; /// Hint that pairing the given load or store is unprofitable. 
- void suppressLdStPair(MachineInstr *MI) const; + void suppressLdStPair(MachineInstr &MI) const; - bool getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, - unsigned &Offset, + bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, + int64_t &Offset, const TargetRegisterInfo *TRI) const override; - bool getMemOpBaseRegImmOfsWidth(MachineInstr *LdSt, unsigned &BaseReg, - int &Offset, int &Width, + bool getMemOpBaseRegImmOfsWidth(MachineInstr &LdSt, unsigned &BaseReg, + int64_t &Offset, unsigned &Width, const TargetRegisterInfo *TRI) const; bool enableClusterLoads() const override { return true; } - bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt, - unsigned NumLoads) const override; + bool enableClusterStores() const override { return true; } + + bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt, + unsigned NumLoads) const override; - bool shouldScheduleAdjacent(MachineInstr *First, - MachineInstr *Second) const override; + bool shouldScheduleAdjacent(MachineInstr &First, + MachineInstr &Second) const override; MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var, - const MDNode *Expr, DebugLoc DL) const; + const MDNode *Expr, + const DebugLoc &DL) const; void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - DebugLoc DL, unsigned DestReg, unsigned SrcReg, + const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, llvm::ArrayRef<unsigned> Indices) const; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - DebugLoc DL, unsigned DestReg, unsigned SrcReg, + const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, @@ -129,40 +135,47 @@ public: const TargetRegisterInfo *TRI) const override; using TargetInstrInfo::foldMemoryOperandImpl; - MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, - int FrameIndex) const override; + MachineInstr * + foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, + ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, int FrameIndex, + LiveIntervals *LIS = nullptr) const override; - bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, bool AllowModify = false) const override; unsigned RemoveBranch(MachineBasicBlock &MBB) const override; unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, - DebugLoc DL) const override; + const DebugLoc &DL) const override; bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond, unsigned, unsigned, int &, int &, int &) const override; void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - DebugLoc DL, unsigned DstReg, ArrayRef<MachineOperand> Cond, - unsigned TrueReg, unsigned FalseReg) const override; + const DebugLoc &DL, unsigned DstReg, + ArrayRef<MachineOperand> Cond, unsigned TrueReg, + unsigned FalseReg) const override; void getNoopForMachoTarget(MCInst &NopInst) const override; /// analyzeCompare - For a comparison instruction, return the source registers /// in SrcReg and SrcReg2, and the value it 
compares against in CmpValue. /// Return true if the comparison instruction can be analyzed. - bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, + bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, unsigned &SrcReg2, int &CmpMask, int &CmpValue) const override; /// optimizeCompareInstr - Convert the instruction supplying the argument to /// the comparison into one that sets the zero bit in the flags register. - bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, + bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const override; - bool optimizeCondBranch(MachineInstr *MI) const override; + bool optimizeCondBranch(MachineInstr &MI) const override; + + /// Return true when a code sequence can improve throughput. It + /// should be called only for instructions in loops. + /// \param Pattern - combiner pattern + bool isThroughputPattern(MachineCombinerPattern Pattern) const override; /// Return true when there is potentially a faster code sequence /// for an instruction chain ending in <Root>. All potential patterns are /// listed in the <Patterns> array. @@ -179,10 +192,10 @@ public: SmallVectorImpl<MachineInstr *> &InsInstrs, SmallVectorImpl<MachineInstr *> &DelInstrs, DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override; - /// useMachineCombiner - AArch64 supports MachineCombiner + /// AArch64 supports MachineCombiner. bool useMachineCombiner() const override; - bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + bool expandPostRAPseudo(MachineInstr &MI) const override; std::pair<unsigned, unsigned> decomposeMachineOperandsTargetFlags(unsigned TF) const override; @@ -192,9 +205,11 @@ public: getSerializableBitmaskMachineOperandTargetFlags() const override; private: - void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL, + void instantiateCondBranch(MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB, ArrayRef<MachineOperand> Cond) const; + bool substituteCmpToZero(MachineInstr &CmpInstr, unsigned SrcReg, + const MachineRegisterInfo *MRI) const; }; /// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg @@ -202,8 +217,8 @@ private: /// insertion (PEI) pass, where a virtual scratch register may be allocated /// if necessary, to be replaced by the scavenger at the end of PEI. 
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - DebugLoc DL, unsigned DestReg, unsigned SrcReg, int Offset, - const TargetInstrInfo *TII, + const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, + int Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag = MachineInstr::NoFlags, bool SetNZCV = false); diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index d02bc9ff394d3..af9ed812e6da3 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -26,6 +26,8 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">, AssemblerPredicate<"FeatureCrypto", "crypto">; def HasCRC : Predicate<"Subtarget->hasCRC()">, AssemblerPredicate<"FeatureCRC", "crc">; +def HasRAS : Predicate<"Subtarget->hasRAS()">, + AssemblerPredicate<"FeatureRAS", "ras">; def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">; def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, AssemblerPredicate<"FeatureFullFP16", "fullfp16">; @@ -34,7 +36,8 @@ def HasSPE : Predicate<"Subtarget->hasSPE()">, def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; -def IsCyclone : Predicate<"Subtarget->isCyclone()">; +def UseAlternateSExtLoadCVTF32 + : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">; //===----------------------------------------------------------------------===// // AArch64-specific DAG Nodes. @@ -283,6 +286,9 @@ def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>; def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>; +def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>; +def AArch64frsqrte : SDNode<"AArch64ISD::FRSQRTE", SDTFPUnaryOp>; + def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>; def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>; def AArch64sminv : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>; @@ -295,9 +301,6 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>; //===----------------------------------------------------------------------===// // AArch64 Instruction Predicate Definitions. -// -def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">; -def NoZCZ : Predicate<"!Subtarget->hasZeroCycleZeroing()">; def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">; def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">; def ForCodeSize : Predicate<"ForCodeSize">; @@ -312,10 +315,13 @@ include "AArch64InstrFormats.td" //===----------------------------------------------------------------------===// let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in { +// We set Sched to empty list because we expect these instructions to simply get +// removed in most cases. 
def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), - [(AArch64callseq_start timm:$amt)]>; + [(AArch64callseq_start timm:$amt)]>, Sched<[]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), - [(AArch64callseq_end timm:$amt1, timm:$amt2)]>; + [(AArch64callseq_end timm:$amt1, timm:$amt2)]>, + Sched<[]>; } // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 let isReMaterializable = 1, isCodeGenOnly = 1 in { @@ -383,6 +389,7 @@ def : InstAlias<"wfe", (HINT 0b010)>; def : InstAlias<"wfi", (HINT 0b011)>; def : InstAlias<"sev", (HINT 0b100)>; def : InstAlias<"sevl", (HINT 0b101)>; +def : InstAlias<"esb", (HINT 0b10000)>, Requires<[HasRAS]>; // v8.2a Statistical Profiling extension def : InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>; @@ -528,6 +535,12 @@ def i64imm_32bit : ImmLeaf<i64, [{ return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm); }]>; +def s64imm_32bit : ImmLeaf<i64, [{ + int64_t Imm64 = static_cast<int64_t>(Imm); + return Imm64 >= std::numeric_limits<int32_t>::min() && + Imm64 <= std::numeric_limits<int32_t>::max(); +}]>; + def trunc_imm : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i32); }]>; @@ -679,10 +692,11 @@ def : InstAlias<"negs $dst, $src$shift", // Unsigned/Signed divide defm UDIV : Div<0, "udiv", udiv>; defm SDIV : Div<1, "sdiv", sdiv>; -let isCodeGenOnly = 1 in { -defm UDIV_Int : Div<0, "udiv", int_aarch64_udiv>; -defm SDIV_Int : Div<1, "sdiv", int_aarch64_sdiv>; -} + +def : Pat<(int_aarch64_udiv GPR32:$Rn, GPR32:$Rm), (UDIVWr $Rn, $Rm)>; +def : Pat<(int_aarch64_udiv GPR64:$Rn, GPR64:$Rm), (UDIVXr $Rn, $Rm)>; +def : Pat<(int_aarch64_sdiv GPR32:$Rn, GPR32:$Rm), (SDIVWr $Rn, $Rm)>; +def : Pat<(int_aarch64_sdiv GPR64:$Rn, GPR64:$Rm), (SDIVXr $Rn, $Rm)>; // Variable shift defm ASRV : Shift<0b10, "asr", sra>; @@ -734,6 +748,40 @@ def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))), (SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))), (UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; + +def : Pat<(i64 (mul (sext GPR32:$Rn), (s64imm_32bit:$C))), + (SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>; +def : Pat<(i64 (mul (zext GPR32:$Rn), (i64imm_32bit:$C))), + (UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>; +def : Pat<(i64 (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C))), + (SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)), + (MOVi32imm (trunc_imm imm:$C)), XZR)>; + +def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))), + (SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>; +def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))), + (UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>; +def : Pat<(i64 (ineg (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)))), + (SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)), + (MOVi32imm (trunc_imm imm:$C)), XZR)>; + +def : Pat<(i64 (add (mul (sext GPR32:$Rn), (s64imm_32bit:$C)), GPR64:$Ra)), + (SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>; +def : Pat<(i64 (add (mul (zext GPR32:$Rn), (i64imm_32bit:$C)), GPR64:$Ra)), + (UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>; +def : Pat<(i64 (add (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)), + GPR64:$Ra)), + (SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)), + (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>; + +def : Pat<(i64 (sub GPR64:$Ra, (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))), + (SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm 
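The s64imm_32bit leaf only matches 64-bit constants that survive a round trip through int32_t, and that is what makes the SMADDL/SMSUBL rewrites legal: if C == sext64(trunc32(C)), the widening 32x32->64 multiply of the register by trunc32(C) produces the same 64-bit product as the original sign-extend-then-multiply, and the magnitude of the product cannot overflow 64 bits. A small check of that identity in plain C++ (this is an illustration, not LLVM code):

    #include <cassert>
    #include <cstdint>
    #include <limits>

    // The condition the s64imm_32bit leaf checks: does the constant survive
    // truncation to 32 bits and sign extension back?
    bool fitsInSigned32(int64_t C) {
      return C >= std::numeric_limits<int32_t>::min() &&
             C <= std::numeric_limits<int32_t>::max();
    }

    // Model of SMADDL: widen both 32-bit inputs, multiply, add the accumulator.
    int64_t smaddl(int32_t Rn, int32_t Rm, int64_t Ra) {
      return static_cast<int64_t>(Rn) * static_cast<int64_t>(Rm) + Ra;
    }

    int main() {
      int32_t Rn = -12345;
      int64_t C = 70000; // fits in int32_t, so the fold applies
      int64_t Ra = 999;
      assert(fitsInSigned32(C));
      // (i64 (add (mul (sext Rn), C), Ra))  ==  SMADDL Rn, trunc(C), Ra
      int64_t Reference = static_cast<int64_t>(Rn) * C + Ra;
      assert(smaddl(Rn, static_cast<int32_t>(C), Ra) == Reference);
    }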
imm:$C)), GPR64:$Ra)>; +def : Pat<(i64 (sub GPR64:$Ra, (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))), + (UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>; +def : Pat<(i64 (sub GPR64:$Ra, (mul (sext_inreg GPR64:$Rn, i32), + (s64imm_32bit:$C)))), + (SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)), + (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>; } // AddedComplexity = 5 def : MulAccumWAlias<"mul", MADDWrrr>; @@ -1089,6 +1137,14 @@ def : Pat<(AArch64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV), (CSINVWr WZR, WZR, (i32 imm:$cc))>; def : Pat<(AArch64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV), (CSINVXr XZR, XZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel GPR32:$tval, (i32 -1), (i32 imm:$cc), NZCV), + (CSINVWr GPR32:$tval, WZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel GPR64:$tval, (i64 -1), (i32 imm:$cc), NZCV), + (CSINVXr GPR64:$tval, XZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel (i32 -1), GPR32:$fval, (i32 imm:$cc), NZCV), + (CSINVWr GPR32:$fval, WZR, (i32 (inv_cond_XFORM imm:$cc)))>; +def : Pat<(AArch64csel (i64 -1), GPR64:$fval, (i32 imm:$cc), NZCV), + (CSINVXr GPR64:$fval, XZR, (i32 (inv_cond_XFORM imm:$cc)))>; // The inverse of the condition code from the alias instruction is what is used // in the aliased instruction. The parser all ready inverts the condition code @@ -1158,7 +1214,8 @@ def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>; // Create a separate pseudo-instruction for codegen to use so that we don't // flag lr as used in every function. It'll be restored before the RET by the // epilogue if it's legitimately used. -def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]> { +def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]>, + Sched<[WriteBrReg]> { let isTerminator = 1; let isBarrier = 1; let isReturn = 1; @@ -1168,7 +1225,7 @@ def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]> { // R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction // (which in the usual case is a BLR). 
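The new CSINV patterns rely on the fact that CSINV writes the bitwise NOT of its second source when the condition fails, and ~0 is -1: select(cc, tval, -1) becomes CSINV tval, zr, cc, and select(cc, -1, fval) is the same with the condition inverted. A quick scalar check of that identity:

    #include <cassert>
    #include <cstdint>

    // Model of CSINV Rd, Rn, Rm, cc:  Rd = cc ? Rn : ~Rm.
    int64_t csinv(bool Cond, int64_t Rn, int64_t Rm) { return Cond ? Rn : ~Rm; }

    int main() {
      int64_t TVal = 42, FVal = 7;
      const bool Conds[] = {false, true};
      for (bool Cond : Conds) {
        // csel Cond ? TVal : -1  ==  CSINV TVal, XZR, Cond
        assert((Cond ? TVal : int64_t(-1)) == csinv(Cond, TVal, /*XZR=*/0));
        // csel Cond ? -1 : FVal  ==  CSINV FVal, XZR, !Cond
        assert((Cond ? int64_t(-1) : FVal) == csinv(!Cond, FVal, /*XZR=*/0));
      }
    }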
let hasSideEffects = 1 in -def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []> { +def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []>, Sched<[]> { let AsmString = ".tlsdesccall $sym"; } @@ -1178,7 +1235,8 @@ let isCall = 1, Defs = [LR, X0, X1], hasSideEffects = 1, isCodeGenOnly = 1 in def TLSDESC_CALLSEQ : Pseudo<(outs), (ins i64imm:$sym), - [(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>; + [(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>, + Sched<[WriteI, WriteLD, WriteI, WriteBrReg]>; def : Pat<(AArch64tlsdesc_callseq texternalsym:$sym), (TLSDESC_CALLSEQ texternalsym:$sym)>; @@ -2444,13 +2502,32 @@ defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>; defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>; defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>; defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>; -let isCodeGenOnly = 1 in { -defm FCVTZS_Int : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>; -defm FCVTZU_Int : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; -defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>; -defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; + +multiclass FPToIntegerIntPats<Intrinsic round, string INST> { + def : Pat<(i32 (round f16:$Rn)), (!cast<Instruction>(INST # UWHr) $Rn)>; + def : Pat<(i64 (round f16:$Rn)), (!cast<Instruction>(INST # UXHr) $Rn)>; + def : Pat<(i32 (round f32:$Rn)), (!cast<Instruction>(INST # UWSr) $Rn)>; + def : Pat<(i64 (round f32:$Rn)), (!cast<Instruction>(INST # UXSr) $Rn)>; + def : Pat<(i32 (round f64:$Rn)), (!cast<Instruction>(INST # UWDr) $Rn)>; + def : Pat<(i64 (round f64:$Rn)), (!cast<Instruction>(INST # UXDr) $Rn)>; + + def : Pat<(i32 (round (fmul f16:$Rn, fixedpoint_f16_i32:$scale))), + (!cast<Instruction>(INST # SWHri) $Rn, $scale)>; + def : Pat<(i64 (round (fmul f16:$Rn, fixedpoint_f16_i64:$scale))), + (!cast<Instruction>(INST # SXHri) $Rn, $scale)>; + def : Pat<(i32 (round (fmul f32:$Rn, fixedpoint_f32_i32:$scale))), + (!cast<Instruction>(INST # SWSri) $Rn, $scale)>; + def : Pat<(i64 (round (fmul f32:$Rn, fixedpoint_f32_i64:$scale))), + (!cast<Instruction>(INST # SXSri) $Rn, $scale)>; + def : Pat<(i32 (round (fmul f64:$Rn, fixedpoint_f64_i32:$scale))), + (!cast<Instruction>(INST # SWDri) $Rn, $scale)>; + def : Pat<(i64 (round (fmul f64:$Rn, fixedpoint_f64_i64:$scale))), + (!cast<Instruction>(INST # SXDri) $Rn, $scale)>; } +defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzs, "FCVTZS">; +defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzu, "FCVTZU">; + multiclass FPToIntegerPats<SDNode to_int, SDNode round, string INST> { def : Pat<(i32 (to_int (round f32:$Rn))), (!cast<Instruction>(INST # UWSr) f32:$Rn)>; @@ -2485,13 +2562,11 @@ defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>; defm FMOV : UnscaledConversion<"fmov">; // Add pseudo ops for FMOV 0 so we can mark them as isReMaterializable -let isReMaterializable = 1, isCodeGenOnly = 1 in { +let isReMaterializable = 1, isCodeGenOnly = 1, isAsCheapAsAMove = 1 in { def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>, - PseudoInstExpansion<(FMOVWSr FPR32:$Rd, WZR)>, - Requires<[NoZCZ]>; + Sched<[WriteF]>; def FMOVD0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>, - PseudoInstExpansion<(FMOVXDr FPR64:$Rd, XZR)>, - Requires<[NoZCZ]>; + Sched<[WriteF]>; } //===----------------------------------------------------------------------===// @@ -2617,6 +2692,7 @@ def F128CSEL : 
Pseudo<(outs FPR128:$Rd), (i32 imm:$cond), NZCV))]> { let Uses = [NZCV]; let usesCustomInserter = 1; + let hasNoSchedulingInfo = 1; } @@ -2742,12 +2818,19 @@ defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn", int_aarch64_neon_fcvtxn>; defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>; defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>; -let isCodeGenOnly = 1 in { -defm FCVTZS_Int : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", - int_aarch64_neon_fcvtzs>; -defm FCVTZU_Int : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", - int_aarch64_neon_fcvtzu>; -} + +def : Pat<(v4i16 (int_aarch64_neon_fcvtzs v4f16:$Rn)), (FCVTZSv4f16 $Rn)>; +def : Pat<(v8i16 (int_aarch64_neon_fcvtzs v8f16:$Rn)), (FCVTZSv8f16 $Rn)>; +def : Pat<(v2i32 (int_aarch64_neon_fcvtzs v2f32:$Rn)), (FCVTZSv2f32 $Rn)>; +def : Pat<(v4i32 (int_aarch64_neon_fcvtzs v4f32:$Rn)), (FCVTZSv4f32 $Rn)>; +def : Pat<(v2i64 (int_aarch64_neon_fcvtzs v2f64:$Rn)), (FCVTZSv2f64 $Rn)>; + +def : Pat<(v4i16 (int_aarch64_neon_fcvtzu v4f16:$Rn)), (FCVTZUv4f16 $Rn)>; +def : Pat<(v8i16 (int_aarch64_neon_fcvtzu v8f16:$Rn)), (FCVTZUv8f16 $Rn)>; +def : Pat<(v2i32 (int_aarch64_neon_fcvtzu v2f32:$Rn)), (FCVTZUv2f32 $Rn)>; +def : Pat<(v4i32 (int_aarch64_neon_fcvtzu v4f32:$Rn)), (FCVTZUv4f32 $Rn)>; +def : Pat<(v2i64 (int_aarch64_neon_fcvtzu v2f64:$Rn)), (FCVTZUv2f64 $Rn)>; + defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>; defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>; defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>; @@ -3318,6 +3401,19 @@ def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))), def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))), (FRECPEv1i64 FPR64:$Rn)>; +def : Pat<(f32 (AArch64frecpe (f32 FPR32:$Rn))), + (FRECPEv1i32 FPR32:$Rn)>; +def : Pat<(v2f32 (AArch64frecpe (v2f32 V64:$Rn))), + (FRECPEv2f32 V64:$Rn)>; +def : Pat<(v4f32 (AArch64frecpe (v4f32 FPR128:$Rn))), + (FRECPEv4f32 FPR128:$Rn)>; +def : Pat<(f64 (AArch64frecpe (f64 FPR64:$Rn))), + (FRECPEv1i64 FPR64:$Rn)>; +def : Pat<(v1f64 (AArch64frecpe (v1f64 FPR64:$Rn))), + (FRECPEv1i64 FPR64:$Rn)>; +def : Pat<(v2f64 (AArch64frecpe (v2f64 FPR128:$Rn))), + (FRECPEv2f64 FPR128:$Rn)>; + def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))), (FRECPXv1i32 FPR32:$Rn)>; def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))), @@ -3330,6 +3426,19 @@ def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))), def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))), (FRSQRTEv1i64 FPR64:$Rn)>; +def : Pat<(f32 (AArch64frsqrte (f32 FPR32:$Rn))), + (FRSQRTEv1i32 FPR32:$Rn)>; +def : Pat<(v2f32 (AArch64frsqrte (v2f32 V64:$Rn))), + (FRSQRTEv2f32 V64:$Rn)>; +def : Pat<(v4f32 (AArch64frsqrte (v4f32 FPR128:$Rn))), + (FRSQRTEv4f32 FPR128:$Rn)>; +def : Pat<(f64 (AArch64frsqrte (f64 FPR64:$Rn))), + (FRSQRTEv1i64 FPR64:$Rn)>; +def : Pat<(v1f64 (AArch64frsqrte (v1f64 FPR64:$Rn))), + (FRSQRTEv1i64 FPR64:$Rn)>; +def : Pat<(v2f64 (AArch64frsqrte (v2f64 FPR128:$Rn))), + (FRSQRTEv2f64 FPR128:$Rn)>; + // If an integer is about to be converted to a floating point value, // just load it on the floating point unit. // Here are the patterns for 8 and 16-bits to float. @@ -4319,18 +4428,6 @@ def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128, "movi", ".2d", [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>; - -// Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing. -// Complexity is added to break a tie with a plain MOVI. 
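The patterns added here only wire the FRECPE/FRSQRTE estimate instructions up to the new AArch64frecpe/AArch64frsqrte nodes. The general point of exposing a reciprocal estimate is that it can be refined iteratively; as an illustration of the arithmetic only (not of what the backend actually emits), one Newton-Raphson step for a reciprocal looks like this:

    #include <cassert>
    #include <cmath>

    // One Newton-Raphson refinement of a reciprocal estimate:
    //   x_{n+1} = x_n * (2 - d * x_n)
    // Each step roughly doubles the number of correct bits of the estimate.
    float refineRecip(float d, float est) { return est * (2.0f - d * est); }

    int main() {
      float d = 3.0f;
      float est = 0.3f; // crude initial estimate of 1/3
      for (int i = 0; i < 3; ++i)
        est = refineRecip(d, est);
      assert(std::fabs(est - 1.0f / d) < 1e-6f);
    }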
-let AddedComplexity = 1 in { -def : Pat<(f32 fpimm0), - (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>, - Requires<[HasZCZ]>; -def : Pat<(f64 fpimm0), - (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>, - Requires<[HasZCZ]>; -} - def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>; def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>; def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>; @@ -4845,7 +4942,8 @@ class SExtLoadi8CVTf32Pat<dag addrmode, dag INST> 0), dsub)), 0), - ssub)))>, Requires<[NotForCodeSize, IsCyclone]>; + ssub)))>, + Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>; def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext), (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>; @@ -4898,7 +4996,8 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST> 0), dsub)), 0), - dsub)))>, Requires<[NotForCodeSize, IsCyclone]>; + dsub)))>, + Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>; def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext), (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>; @@ -5982,7 +6081,7 @@ def : NTStore64Pat<v8i8>; def : Pat<(nontemporalstore GPR64:$Rt, (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)), (STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), - (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 0, 31), sub_32), + (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 32, 63), sub_32), GPR64sp:$Rn, simm7s4:$offset)>; } // AddedComplexity=10 } // Predicates = [IsLE] @@ -5990,8 +6089,10 @@ def : Pat<(nontemporalstore GPR64:$Rt, // Tail call return handling. These are all compiler pseudo-instructions, // so no encoding information or anything like that. let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { - def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff),[]>; - def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>; + def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff), []>, + Sched<[WriteBrReg]>; + def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>, + Sched<[WriteBrReg]>; } def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)), diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 43664df3b861a..dca13fc494140 100644 --- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -33,9 +33,6 @@ using namespace llvm; #define DEBUG_TYPE "aarch64-ldst-opt" -/// AArch64AllocLoadStoreOpt - Post-register allocation pass to combine -/// load / store instructions to form ldp / stp instructions. - STATISTIC(NumPairCreated, "Number of load/store pair instructions generated"); STATISTIC(NumPostFolded, "Number of post-index updates folded"); STATISTIC(NumPreFolded, "Number of pre-index updates folded"); @@ -45,9 +42,19 @@ STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted"); STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted"); STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted"); -static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit", +// The LdStLimit limits how far we search for load/store pairs. +static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit", cl::init(20), cl::Hidden); +// The UpdateLimit limits how far we search for update instructions when we form +// pre-/post-index instructions. 
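The nontemporal-store fix in this hunk is worth spelling out: STNPWi stores two 32-bit words, so the 64-bit GPR has to be split into its low word (sub_32) and its high word. UBFMXri Rd, Rn, #32, #63 is LSR #32 and extracts the high word; the previous immediates #0, #31 extracted the low word a second time. In plain C++ the intended split is simply:

    #include <cassert>
    #include <cstdint>

    // Split a 64-bit value into the two 32-bit words an STNPWi would store.
    void splitForStnp(uint64_t X, uint32_t &Lo, uint32_t &Hi) {
      Lo = static_cast<uint32_t>(X);       // EXTRACT_SUBREG ..., sub_32
      Hi = static_cast<uint32_t>(X >> 32); // UBFM #32,#63 (LSR #32), then sub_32
    }

    int main() {
      uint32_t Lo, Hi;
      splitForStnp(0x1122334455667788ULL, Lo, Hi);
      assert(Lo == 0x55667788u && Hi == 0x11223344u);
    }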
+static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100), + cl::Hidden); + +static cl::opt<bool> EnableNarrowLdMerge("enable-narrow-ld-merge", cl::Hidden, + cl::init(false), + cl::desc("Enable narrow load merge")); + namespace llvm { void initializeAArch64LoadStoreOptPass(PassRegistry &); } @@ -88,22 +95,29 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { const TargetRegisterInfo *TRI; const AArch64Subtarget *Subtarget; + // Track which registers have been modified and used. + BitVector ModifiedRegs, UsedRegs; + // Scan the instructions looking for a load/store that can be combined // with the current instruction into a load/store pair. // Return the matching instruction if one is found, else MBB->end(). MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, LdStPairFlags &Flags, - unsigned Limit); + unsigned Limit, + bool FindNarrowMerge); // Scan the instructions looking for a store that writes to the address from // which the current load instruction reads. Return true if one is found. bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit, MachineBasicBlock::iterator &StoreI); + // Merge the two instructions indicated into a wider instruction. + MachineBasicBlock::iterator + mergeNarrowInsns(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator MergeMI, + const LdStPairFlags &Flags); + // Merge the two instructions indicated into a single pair-wise instruction. - // If MergeForward is true, erase the first instruction and fold its - // operation into the second. If false, the reverse. Return the instruction - // following the first instruction (which may change during processing). MachineBasicBlock::iterator mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, @@ -118,8 +132,8 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // be combined with the current instruction (a load or store) using // pre or post indexed addressing with writeback. Scan forwards. MachineBasicBlock::iterator - findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit, - int UnscaledOffset); + findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, + int UnscaledOffset, unsigned Limit); // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using @@ -129,7 +143,7 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // Find an instruction that updates the base register of the ld/st // instruction. - bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI, + bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI, unsigned BaseReg, int Offset); // Merge a pre- or post-index base register update into a ld/st instruction. @@ -140,17 +154,21 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // Find and merge foldable ldr/str instructions. bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI); + // Find and pair ldr/str instructions. + bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI); + // Find and promote load instructions which read directly from store. bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI); - // Check if converting two narrow loads into a single wider load with - // bitfield extracts could be enabled. 
- bool enableNarrowLdMerge(MachineFunction &Fn); - bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt); bool runOnMachineFunction(MachineFunction &Fn) override; + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::AllVRegsAllocated); + } + const char *getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; } @@ -161,37 +179,8 @@ char AArch64LoadStoreOpt::ID = 0; INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt", AARCH64_LOAD_STORE_OPT_NAME, false, false) -static bool isUnscaledLdSt(unsigned Opc) { - switch (Opc) { - default: - return false; - case AArch64::STURSi: - case AArch64::STURDi: - case AArch64::STURQi: - case AArch64::STURBBi: - case AArch64::STURHHi: - case AArch64::STURWi: - case AArch64::STURXi: - case AArch64::LDURSi: - case AArch64::LDURDi: - case AArch64::LDURQi: - case AArch64::LDURWi: - case AArch64::LDURXi: - case AArch64::LDURSWi: - case AArch64::LDURHHi: - case AArch64::LDURBBi: - case AArch64::LDURSBWi: - case AArch64::LDURSHWi: - return true; - } -} - -static bool isUnscaledLdSt(MachineInstr *MI) { - return isUnscaledLdSt(MI->getOpcode()); -} - -static unsigned getBitExtrOpcode(MachineInstr *MI) { - switch (MI->getOpcode()) { +static unsigned getBitExtrOpcode(MachineInstr &MI) { + switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected opcode."); case AArch64::LDRBBui: @@ -219,10 +208,6 @@ static bool isNarrowStore(unsigned Opc) { } } -static bool isNarrowStore(MachineInstr *MI) { - return isNarrowStore(MI->getOpcode()); -} - static bool isNarrowLoad(unsigned Opc) { switch (Opc) { default: @@ -239,13 +224,17 @@ static bool isNarrowLoad(unsigned Opc) { } } -static bool isNarrowLoad(MachineInstr *MI) { - return isNarrowLoad(MI->getOpcode()); +static bool isNarrowLoad(MachineInstr &MI) { + return isNarrowLoad(MI.getOpcode()); +} + +static bool isNarrowLoadOrStore(unsigned Opc) { + return isNarrowLoad(Opc) || isNarrowStore(Opc); } // Scaling factor for unscaled load or store. 
-static int getMemScale(MachineInstr *MI) { - switch (MI->getOpcode()) { +static int getMemScale(MachineInstr &MI) { + switch (MI.getOpcode()) { default: llvm_unreachable("Opcode has unknown scale!"); case AArch64::LDRBBui: @@ -354,6 +343,37 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc, } } +static unsigned getMatchingWideOpcode(unsigned Opc) { + switch (Opc) { + default: + llvm_unreachable("Opcode has no wide equivalent!"); + case AArch64::STRBBui: + return AArch64::STRHHui; + case AArch64::STRHHui: + return AArch64::STRWui; + case AArch64::STURBBi: + return AArch64::STURHHi; + case AArch64::STURHHi: + return AArch64::STURWi; + case AArch64::STURWi: + return AArch64::STURXi; + case AArch64::STRWui: + return AArch64::STRXui; + case AArch64::LDRHHui: + case AArch64::LDRSHWui: + return AArch64::LDRWui; + case AArch64::LDURHHi: + case AArch64::LDURSHWi: + return AArch64::LDURWi; + case AArch64::LDRBBui: + case AArch64::LDRSBWui: + return AArch64::LDRHHui; + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + return AArch64::LDURHHi; + } +} + static unsigned getMatchingPairOpcode(unsigned Opc) { switch (Opc) { default: @@ -367,14 +387,6 @@ static unsigned getMatchingPairOpcode(unsigned Opc) { case AArch64::STRQui: case AArch64::STURQi: return AArch64::STPQi; - case AArch64::STRBBui: - return AArch64::STRHHui; - case AArch64::STRHHui: - return AArch64::STRWui; - case AArch64::STURBBi: - return AArch64::STURHHi; - case AArch64::STURHHi: - return AArch64::STURWi; case AArch64::STRWui: case AArch64::STURWi: return AArch64::STPWi; @@ -399,25 +411,13 @@ static unsigned getMatchingPairOpcode(unsigned Opc) { case AArch64::LDRSWui: case AArch64::LDURSWi: return AArch64::LDPSWi; - case AArch64::LDRHHui: - case AArch64::LDRSHWui: - return AArch64::LDRWui; - case AArch64::LDURHHi: - case AArch64::LDURSHWi: - return AArch64::LDURWi; - case AArch64::LDRBBui: - case AArch64::LDRSBWui: - return AArch64::LDRHHui; - case AArch64::LDURBBi: - case AArch64::LDURSBWi: - return AArch64::LDURHHi; } } -static unsigned isMatchingStore(MachineInstr *LoadInst, - MachineInstr *StoreInst) { - unsigned LdOpc = LoadInst->getOpcode(); - unsigned StOpc = StoreInst->getOpcode(); +static unsigned isMatchingStore(MachineInstr &LoadInst, + MachineInstr &StoreInst) { + unsigned LdOpc = LoadInst.getOpcode(); + unsigned StOpc = StoreInst.getOpcode(); switch (LdOpc) { default: llvm_unreachable("Unsupported load instruction!"); @@ -562,8 +562,8 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { } } -static bool isPairedLdSt(const MachineInstr *MI) { - switch (MI->getOpcode()) { +static bool isPairedLdSt(const MachineInstr &MI) { + switch (MI.getOpcode()) { default: return false; case AArch64::LDPSi: @@ -581,41 +581,55 @@ static bool isPairedLdSt(const MachineInstr *MI) { } } -static const MachineOperand &getLdStRegOp(const MachineInstr *MI, +static const MachineOperand &getLdStRegOp(const MachineInstr &MI, unsigned PairedRegOp = 0) { assert(PairedRegOp < 2 && "Unexpected register operand idx."); unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0; - return MI->getOperand(Idx); + return MI.getOperand(Idx); } -static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) { +static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) { unsigned Idx = isPairedLdSt(MI) ? 2 : 1; - return MI->getOperand(Idx); + return MI.getOperand(Idx); } -static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) { +static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) { unsigned Idx = isPairedLdSt(MI) ? 
3 : 2; - return MI->getOperand(Idx); + return MI.getOperand(Idx); } -static bool isLdOffsetInRangeOfSt(MachineInstr *LoadInst, - MachineInstr *StoreInst) { +static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst, + MachineInstr &StoreInst, + const AArch64InstrInfo *TII) { assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st."); int LoadSize = getMemScale(LoadInst); int StoreSize = getMemScale(StoreInst); - int UnscaledStOffset = isUnscaledLdSt(StoreInst) + int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst) ? getLdStOffsetOp(StoreInst).getImm() : getLdStOffsetOp(StoreInst).getImm() * StoreSize; - int UnscaledLdOffset = isUnscaledLdSt(LoadInst) + int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst) ? getLdStOffsetOp(LoadInst).getImm() : getLdStOffsetOp(LoadInst).getImm() * LoadSize; return (UnscaledStOffset <= UnscaledLdOffset) && (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize)); } +static bool isPromotableZeroStoreOpcode(unsigned Opc) { + return isNarrowStore(Opc) || Opc == AArch64::STRWui || Opc == AArch64::STURWi; +} + +static bool isPromotableZeroStoreOpcode(MachineInstr &MI) { + return isPromotableZeroStoreOpcode(MI.getOpcode()); +} + +static bool isPromotableZeroStoreInst(MachineInstr &MI) { + return (isPromotableZeroStoreOpcode(MI)) && + getLdStRegOp(MI).getReg() == AArch64::WZR; +} + MachineBasicBlock::iterator -AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, +AArch64LoadStoreOpt::mergeNarrowInsns(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator MergeMI, const LdStPairFlags &Flags) { MachineBasicBlock::iterator NextI = I; ++NextI; @@ -623,128 +637,124 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, // to skip one further. Either way we merge will invalidate the iterator, // and we don't need to scan the new instruction, as it's a pairwise // instruction, which we're not considering for further action anyway. - if (NextI == Paired) + if (NextI == MergeMI) ++NextI; - int SExtIdx = Flags.getSExtIdx(); - unsigned Opc = - SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode()); - bool IsUnscaled = isUnscaledLdSt(Opc); - int OffsetStride = IsUnscaled ? getMemScale(I) : 1; + unsigned Opc = I->getOpcode(); + bool IsScaled = !TII->isUnscaledLdSt(Opc); + int OffsetStride = IsScaled ? 1 : getMemScale(*I); bool MergeForward = Flags.getMergeForward(); - unsigned NewOpc = getMatchingPairOpcode(Opc); // Insert our new paired instruction after whichever of the paired // instructions MergeForward indicates. - MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; + MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I; // Also based on MergeForward is from where we copy the base register operand // so we get the flags compatible with the input code. const MachineOperand &BaseRegOp = - MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I); + MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I); // Which register is Rt and which is Rt2 depends on the offset order. MachineInstr *RtMI, *Rt2MI; - if (getLdStOffsetOp(I).getImm() == - getLdStOffsetOp(Paired).getImm() + OffsetStride) { - RtMI = Paired; - Rt2MI = I; - // Here we swapped the assumption made for SExtIdx. - // I.e., we turn ldp I, Paired into ldp Paired, I. - // Update the index accordingly. 
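The containment test used when promoting a load from an earlier store (isLdOffsetInRangeOfSt above) reduces to a byte-interval check once both offsets are expressed unscaled: the loaded bytes must lie entirely inside the bytes written by the store. As a standalone predicate:

    #include <cassert>

    // Unscaled byte offsets and access sizes; true iff the loaded bytes lie
    // entirely within the bytes written by the store.
    bool loadCoveredByStore(int LdOffset, int LoadSize, int StOffset,
                            int StoreSize) {
      return StOffset <= LdOffset &&
             LdOffset + LoadSize <= StOffset + StoreSize;
    }

    int main() {
      // A 4-byte load at offset 4 out of an 8-byte store at offset 0: covered.
      assert(loadCoveredByStore(4, 4, 0, 8));
      // A 4-byte load at offset 6 straddles the end of that store.
      assert(!loadCoveredByStore(6, 4, 0, 8));
    }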
- if (SExtIdx != -1) - SExtIdx = (SExtIdx + 1) % 2; + if (getLdStOffsetOp(*I).getImm() == + getLdStOffsetOp(*MergeMI).getImm() + OffsetStride) { + RtMI = &*MergeMI; + Rt2MI = &*I; } else { - RtMI = I; - Rt2MI = Paired; + RtMI = &*I; + Rt2MI = &*MergeMI; } - int OffsetImm = getLdStOffsetOp(RtMI).getImm(); + int OffsetImm = getLdStOffsetOp(*RtMI).getImm(); + // Change the scaled offset from small to large type. + if (IsScaled) { + assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); + OffsetImm /= 2; + } + DebugLoc DL = I->getDebugLoc(); + MachineBasicBlock *MBB = I->getParent(); if (isNarrowLoad(Opc)) { - // Change the scaled offset from small to large type. - if (!IsUnscaled) { - assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); - OffsetImm /= 2; - } - MachineInstr *RtNewDest = MergeForward ? I : Paired; + MachineInstr *RtNewDest = &*(MergeForward ? I : MergeMI); // When merging small (< 32 bit) loads for big-endian targets, the order of // the component parts gets swapped. if (!Subtarget->isLittleEndian()) std::swap(RtMI, Rt2MI); // Construct the new load instruction. MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2; - NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(NewOpc)) - .addOperand(getLdStRegOp(RtNewDest)) - .addOperand(BaseRegOp) - .addImm(OffsetImm) - .setMemRefs(I->mergeMemRefsWith(*Paired)); + NewMemMI = + BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc))) + .addOperand(getLdStRegOp(*RtNewDest)) + .addOperand(BaseRegOp) + .addImm(OffsetImm) + .setMemRefs(I->mergeMemRefsWith(*MergeMI)); + (void)NewMemMI; DEBUG( dbgs() << "Creating the new load and extract. Replacing instructions:\n "); DEBUG(I->print(dbgs())); DEBUG(dbgs() << " "); - DEBUG(Paired->print(dbgs())); + DEBUG(MergeMI->print(dbgs())); DEBUG(dbgs() << " with instructions:\n "); DEBUG((NewMemMI)->print(dbgs())); - int Width = getMemScale(I) == 1 ? 8 : 16; + int Width = getMemScale(*I) == 1 ? 8 : 16; int LSBLow = 0; int LSBHigh = Width; int ImmsLow = LSBLow + Width - 1; int ImmsHigh = LSBHigh + Width - 1; - MachineInstr *ExtDestMI = MergeForward ? Paired : I; + MachineInstr *ExtDestMI = &*(MergeForward ? MergeMI : I); if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) { // Create the bitfield extract for high bits. - BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(getBitExtrOpcode(Rt2MI))) - .addOperand(getLdStRegOp(Rt2MI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) - .addImm(LSBHigh) - .addImm(ImmsHigh); + BitExtMI1 = + BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*Rt2MI))) + .addOperand(getLdStRegOp(*Rt2MI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) + .addImm(LSBHigh) + .addImm(ImmsHigh); // Create the bitfield extract for low bits. if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) { // For unsigned, prefer to use AND for low bits. 
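The extract widths computed above (LSBLow = 0, LSBHigh = Width, Imms = LSB + Width - 1) describe how the two original narrow destinations are carved back out of the single wide load: the low half with an AND (or a signed extract), the high half with an extract starting at bit Width. In plain integer terms, for two unsigned byte loads merged into a halfword load:

    #include <cassert>
    #include <cstdint>

    // Recover the two original narrow (unsigned) values from one wide load.
    // Width is the size of each original access in bits (8 or 16 here).
    void splitWideLoad(uint32_t Wide, int Width, uint32_t &Low, uint32_t &High) {
      uint32_t Mask = (1u << Width) - 1;
      Low = Wide & Mask;             // ANDWri with imms = Width - 1
      High = (Wide >> Width) & Mask; // UBFM with lsb = Width, imms = 2*Width - 1
    }

    int main() {
      // Two adjacent byte loads of 0x34 and 0x12 merged into one i16 load of
      // 0x1234 (little-endian layout; the code above swaps the destinations
      // for big-endian targets).
      uint32_t Low, High;
      splitWideLoad(0x1234, /*Width=*/8, Low, High);
      assert(Low == 0x34 && High == 0x12);
    }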
- BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(AArch64::ANDWri)) - .addOperand(getLdStRegOp(RtMI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) + BitExtMI2 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri)) + .addOperand(getLdStRegOp(*RtMI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) .addImm(ImmsLow); } else { - BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(getBitExtrOpcode(RtMI))) - .addOperand(getLdStRegOp(RtMI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) - .addImm(LSBLow) - .addImm(ImmsLow); + BitExtMI2 = + BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*RtMI))) + .addOperand(getLdStRegOp(*RtMI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) + .addImm(LSBLow) + .addImm(ImmsLow); } } else { // Create the bitfield extract for low bits. if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) { // For unsigned, prefer to use AND for low bits. - BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(AArch64::ANDWri)) - .addOperand(getLdStRegOp(RtMI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) + BitExtMI1 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri)) + .addOperand(getLdStRegOp(*RtMI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) .addImm(ImmsLow); } else { - BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(getBitExtrOpcode(RtMI))) - .addOperand(getLdStRegOp(RtMI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) - .addImm(LSBLow) - .addImm(ImmsLow); + BitExtMI1 = + BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*RtMI))) + .addOperand(getLdStRegOp(*RtMI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) + .addImm(LSBLow) + .addImm(ImmsLow); } // Create the bitfield extract for high bits. - BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(getBitExtrOpcode(Rt2MI))) - .addOperand(getLdStRegOp(Rt2MI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) - .addImm(LSBHigh) - .addImm(ImmsHigh); + BitExtMI2 = + BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*Rt2MI))) + .addOperand(getLdStRegOp(*Rt2MI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) + .addImm(LSBHigh) + .addImm(ImmsHigh); } + (void)BitExtMI1; + (void)BitExtMI2; + DEBUG(dbgs() << " "); DEBUG((BitExtMI1)->print(dbgs())); DEBUG(dbgs() << " "); @@ -753,47 +763,122 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, // Erase the old instructions. I->eraseFromParent(); - Paired->eraseFromParent(); + MergeMI->eraseFromParent(); return NextI; } + assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) && + "Expected promotable zero store"); // Construct the new instruction. MachineInstrBuilder MIB; - if (isNarrowStore(Opc)) { - // Change the scaled offset from small to large type. - if (!IsUnscaled) { - assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); - OffsetImm /= 2; + MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc))) + .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR) + .addOperand(BaseRegOp) + .addImm(OffsetImm) + .setMemRefs(I->mergeMemRefsWith(*MergeMI)); + (void)MIB; + + DEBUG(dbgs() << "Creating wider load/store. Replacing instructions:\n "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(MergeMI->print(dbgs())); + DEBUG(dbgs() << " with instruction:\n "); + DEBUG(((MachineInstr *)MIB)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions. 
+ I->eraseFromParent(); + MergeMI->eraseFromParent(); + return NextI; +} + +MachineBasicBlock::iterator +AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + const LdStPairFlags &Flags) { + MachineBasicBlock::iterator NextI = I; + ++NextI; + // If NextI is the second of the two instructions to be merged, we need + // to skip one further. Either way we merge will invalidate the iterator, + // and we don't need to scan the new instruction, as it's a pairwise + // instruction, which we're not considering for further action anyway. + if (NextI == Paired) + ++NextI; + + int SExtIdx = Flags.getSExtIdx(); + unsigned Opc = + SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode()); + bool IsUnscaled = TII->isUnscaledLdSt(Opc); + int OffsetStride = IsUnscaled ? getMemScale(*I) : 1; + + bool MergeForward = Flags.getMergeForward(); + // Insert our new paired instruction after whichever of the paired + // instructions MergeForward indicates. + MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; + // Also based on MergeForward is from where we copy the base register operand + // so we get the flags compatible with the input code. + const MachineOperand &BaseRegOp = + MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I); + + int Offset = getLdStOffsetOp(*I).getImm(); + int PairedOffset = getLdStOffsetOp(*Paired).getImm(); + bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode()); + if (IsUnscaled != PairedIsUnscaled) { + // We're trying to pair instructions that differ in how they are scaled. If + // I is scaled then scale the offset of Paired accordingly. Otherwise, do + // the opposite (i.e., make Paired's offset unscaled). + int MemSize = getMemScale(*Paired); + if (PairedIsUnscaled) { + // If the unscaled offset isn't a multiple of the MemSize, we can't + // pair the operations together. + assert(!(PairedOffset % getMemScale(*Paired)) && + "Offset should be a multiple of the stride!"); + PairedOffset /= MemSize; + } else { + PairedOffset *= MemSize; } - MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(NewOpc)) - .addOperand(getLdStRegOp(I)) - .addOperand(BaseRegOp) - .addImm(OffsetImm) - .setMemRefs(I->mergeMemRefsWith(*Paired)); + } + + // Which register is Rt and which is Rt2 depends on the offset order. + MachineInstr *RtMI, *Rt2MI; + if (Offset == PairedOffset + OffsetStride) { + RtMI = &*Paired; + Rt2MI = &*I; + // Here we swapped the assumption made for SExtIdx. + // I.e., we turn ldp I, Paired into ldp Paired, I. + // Update the index accordingly. + if (SExtIdx != -1) + SExtIdx = (SExtIdx + 1) % 2; } else { - // Handle Unscaled - if (IsUnscaled) - OffsetImm /= OffsetStride; - MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(NewOpc)) - .addOperand(getLdStRegOp(RtMI)) - .addOperand(getLdStRegOp(Rt2MI)) - .addOperand(BaseRegOp) - .addImm(OffsetImm); + RtMI = &*I; + Rt2MI = &*Paired; + } + int OffsetImm = getLdStOffsetOp(*RtMI).getImm(); + // Scale the immediate offset, if necessary. + if (TII->isUnscaledLdSt(RtMI->getOpcode())) { + assert(!(OffsetImm % getMemScale(*RtMI)) && + "Unscaled offset cannot be scaled."); + OffsetImm /= getMemScale(*RtMI); } - (void)MIB; + // Construct the new instruction. 
+ MachineInstrBuilder MIB; + DebugLoc DL = I->getDebugLoc(); + MachineBasicBlock *MBB = I->getParent(); + MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc))) + .addOperand(getLdStRegOp(*RtMI)) + .addOperand(getLdStRegOp(*Rt2MI)) + .addOperand(BaseRegOp) + .addImm(OffsetImm) + .setMemRefs(I->mergeMemRefsWith(*Paired)); - // FIXME: Do we need/want to copy the mem operands from the source - // instructions? Probably. What uses them after this? + (void)MIB; DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n "); DEBUG(I->print(dbgs())); DEBUG(dbgs() << " "); DEBUG(Paired->print(dbgs())); DEBUG(dbgs() << " with instruction:\n "); - if (SExtIdx != -1) { // Generate the sign extension for the proper result of the ldp. // I.e., with X1, that would be: @@ -814,26 +899,23 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, // Insert this definition right after the generated LDP, i.e., before // InsertionPoint. MachineInstrBuilder MIBKill = - BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(TargetOpcode::KILL), DstRegW) + BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW) .addReg(DstRegW) .addReg(DstRegX, RegState::Define); MIBKill->getOperand(2).setImplicit(); // Create the sign extension. MachineInstrBuilder MIBSXTW = - BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(AArch64::SBFMXri), DstRegX) + BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX) .addReg(DstRegX) .addImm(0) .addImm(31); (void)MIBSXTW; DEBUG(dbgs() << " Extend operand:\n "); DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs())); - DEBUG(dbgs() << "\n"); } else { DEBUG(((MachineInstr *)MIB)->print(dbgs())); - DEBUG(dbgs() << "\n"); } + DEBUG(dbgs() << "\n"); // Erase the old instructions. I->eraseFromParent(); @@ -848,10 +930,10 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, MachineBasicBlock::iterator NextI = LoadI; ++NextI; - int LoadSize = getMemScale(LoadI); - int StoreSize = getMemScale(StoreI); - unsigned LdRt = getLdStRegOp(LoadI).getReg(); - unsigned StRt = getLdStRegOp(StoreI).getReg(); + int LoadSize = getMemScale(*LoadI); + int StoreSize = getMemScale(*StoreI); + unsigned LdRt = getLdStRegOp(*LoadI).getReg(); + unsigned StRt = getLdStRegOp(*StoreI).getReg(); bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt); assert((IsStoreXReg || @@ -881,15 +963,16 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, // performance and correctness are verified only in little-endian. if (!Subtarget->isLittleEndian()) return NextI; - bool IsUnscaled = isUnscaledLdSt(LoadI); - assert(IsUnscaled == isUnscaledLdSt(StoreI) && "Unsupported ld/st match"); + bool IsUnscaled = TII->isUnscaledLdSt(*LoadI); + assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) && + "Unsupported ld/st match"); assert(LoadSize <= StoreSize && "Invalid load size"); int UnscaledLdOffset = IsUnscaled - ? getLdStOffsetOp(LoadI).getImm() - : getLdStOffsetOp(LoadI).getImm() * LoadSize; + ? getLdStOffsetOp(*LoadI).getImm() + : getLdStOffsetOp(*LoadI).getImm() * LoadSize; int UnscaledStOffset = IsUnscaled - ? getLdStOffsetOp(StoreI).getImm() - : getLdStOffsetOp(StoreI).getImm() * StoreSize; + ? 
getLdStOffsetOp(*StoreI).getImm() + : getLdStOffsetOp(*StoreI).getImm() * StoreSize; int Width = LoadSize * 8; int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); int Imms = Immr + Width - 1; @@ -926,6 +1009,7 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, .addImm(Imms); } } + (void)BitExtMI; DEBUG(dbgs() << "Promoting load by replacing :\n "); DEBUG(StoreI->print(dbgs())); @@ -944,16 +1028,18 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, /// trackRegDefsUses - Remember what registers the specified instruction uses /// and modifies. -static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs, +static void trackRegDefsUses(const MachineInstr &MI, BitVector &ModifiedRegs, BitVector &UsedRegs, const TargetRegisterInfo *TRI) { - for (const MachineOperand &MO : MI->operands()) { + for (const MachineOperand &MO : MI.operands()) { if (MO.isRegMask()) ModifiedRegs.setBitsNotInMask(MO.getRegMask()); if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); + if (!Reg) + continue; if (MO.isDef()) { for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) ModifiedRegs.set(*AI); @@ -968,38 +1054,42 @@ static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs, static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { // Convert the byte-offset used by unscaled into an "element" offset used // by the scaled pair load/store instructions. - if (IsUnscaled) + if (IsUnscaled) { + // If the byte-offset isn't a multiple of the stride, there's no point + // trying to match it. + if (Offset % OffsetStride) + return false; Offset /= OffsetStride; - + } return Offset <= 63 && Offset >= -64; } // Do alignment, specialized to power of 2 and for signed ints, // avoiding having to do a C-style cast from uint_64t to int when -// using RoundUpToAlignment from include/llvm/Support/MathExtras.h. +// using alignTo from include/llvm/Support/MathExtras.h. // FIXME: Move this function to include/MathExtras.h? static int alignTo(int Num, int PowOf2) { return (Num + PowOf2 - 1) & ~(PowOf2 - 1); } -static bool mayAlias(MachineInstr *MIa, MachineInstr *MIb, +static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb, const AArch64InstrInfo *TII) { // One of the instructions must modify memory. - if (!MIa->mayStore() && !MIb->mayStore()) + if (!MIa.mayStore() && !MIb.mayStore()) return false; // Both instructions must be memory operations. 
- if (!MIa->mayLoadOrStore() && !MIb->mayLoadOrStore()) + if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore()) return false; return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb); } -static bool mayAlias(MachineInstr *MIa, +static bool mayAlias(MachineInstr &MIa, SmallVectorImpl<MachineInstr *> &MemInsns, const AArch64InstrInfo *TII) { - for (auto &MIb : MemInsns) - if (mayAlias(MIa, MIb, TII)) + for (MachineInstr *MIb : MemInsns) + if (mayAlias(MIa, *MIb, TII)) return true; return false; @@ -1008,40 +1098,43 @@ static bool mayAlias(MachineInstr *MIa, bool AArch64LoadStoreOpt::findMatchingStore( MachineBasicBlock::iterator I, unsigned Limit, MachineBasicBlock::iterator &StoreI) { - MachineBasicBlock::iterator E = I->getParent()->begin(); + MachineBasicBlock::iterator B = I->getParent()->begin(); MachineBasicBlock::iterator MBBI = I; - MachineInstr *FirstMI = I; - unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); + MachineInstr &LoadMI = *I; + unsigned BaseReg = getLdStBaseOp(LoadMI).getReg(); + + // If the load is the first instruction in the block, there's obviously + // not any matching store. + if (MBBI == B) + return false; // Track which registers have been modified and used between the first insn // and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); + ModifiedRegs.reset(); + UsedRegs.reset(); - for (unsigned Count = 0; MBBI != E && Count < Limit;) { + unsigned Count = 0; + do { --MBBI; - MachineInstr *MI = MBBI; - // Skip DBG_VALUE instructions. Otherwise debug info can affect the - // optimization by changing how far we scan. - if (MI->isDebugValue()) - continue; - // Now that we know this is a real instruction, count it. - ++Count; + MachineInstr &MI = *MBBI; + + // Don't count DBG_VALUE instructions towards the search limit. + if (!MI.isDebugValue()) + ++Count; // If the load instruction reads directly from the address to which the // store instruction writes and the stored value is not modified, we can // promote the load. Since we do not handle stores with pre-/post-index, // it's unnecessary to check if BaseReg is modified by the store itself. - if (MI->mayStore() && isMatchingStore(FirstMI, MI) && + if (MI.mayStore() && isMatchingStore(LoadMI, MI) && BaseReg == getLdStBaseOp(MI).getReg() && - isLdOffsetInRangeOfSt(FirstMI, MI) && + isLdOffsetInRangeOfSt(LoadMI, MI, TII) && !ModifiedRegs[getLdStRegOp(MI).getReg()]) { StoreI = MBBI; return true; } - if (MI->isCall()) + if (MI.isCall()) return false; // Update modified / uses register lists. @@ -1053,139 +1146,165 @@ bool AArch64LoadStoreOpt::findMatchingStore( return false; // If we encounter a store aliased with the load, return early. - if (MI->mayStore() && mayAlias(FirstMI, MI, TII)) + if (MI.mayStore() && mayAlias(LoadMI, MI, TII)) return false; - } + } while (MBBI != B && Count < Limit); return false; } -/// findMatchingInsn - Scan the instructions looking for a load/store that can -/// be combined with the current instruction into a load/store pair. +// Returns true if FirstMI and MI are candidates for merging or pairing. +// Otherwise, returns false. +static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI, + LdStPairFlags &Flags, + const AArch64InstrInfo *TII) { + // If this is volatile or if pairing is suppressed, not a candidate. + if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) + return false; + + // We should have already checked FirstMI for pair suppression and volatility. 
+ assert(!FirstMI.hasOrderedMemoryRef() && + !TII->isLdStPairSuppressed(FirstMI) && + "FirstMI shouldn't get here if either of these checks are true."); + + unsigned OpcA = FirstMI.getOpcode(); + unsigned OpcB = MI.getOpcode(); + + // Opcodes match: nothing more to check. + if (OpcA == OpcB) + return true; + + // Try to match a sign-extended load/store with a zero-extended load/store. + bool IsValidLdStrOpc, PairIsValidLdStrOpc; + unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc); + assert(IsValidLdStrOpc && + "Given Opc should be a Load or Store with an immediate"); + // OpcA will be the first instruction in the pair. + if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) { + Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0); + return true; + } + + // If the second instruction isn't even a load/store, bail out. + if (!PairIsValidLdStrOpc) + return false; + + // FIXME: We don't support merging narrow loads/stores with mixed + // scaled/unscaled offsets. + if (isNarrowLoadOrStore(OpcA) || isNarrowLoadOrStore(OpcB)) + return false; + + // Try to match an unscaled load/store with a scaled load/store. + return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) && + getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB); + + // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair? +} + +/// Scan the instructions looking for a load/store that can be combined with the +/// current instruction into a wider equivalent or a load/store pair. MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, - LdStPairFlags &Flags, unsigned Limit) { + LdStPairFlags &Flags, unsigned Limit, + bool FindNarrowMerge) { MachineBasicBlock::iterator E = I->getParent()->end(); MachineBasicBlock::iterator MBBI = I; - MachineInstr *FirstMI = I; + MachineInstr &FirstMI = *I; ++MBBI; - unsigned Opc = FirstMI->getOpcode(); - bool MayLoad = FirstMI->mayLoad(); - bool IsUnscaled = isUnscaledLdSt(FirstMI); + bool MayLoad = FirstMI.mayLoad(); + bool IsUnscaled = TII->isUnscaledLdSt(FirstMI); unsigned Reg = getLdStRegOp(FirstMI).getReg(); unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); int Offset = getLdStOffsetOp(FirstMI).getImm(); - bool IsNarrowStore = isNarrowStore(Opc); - - // For narrow stores, find only the case where the stored value is WZR. - if (IsNarrowStore && Reg != AArch64::WZR) - return E; - - // Early exit if the first instruction modifies the base register. - // e.g., ldr x0, [x0] - if (FirstMI->modifiesRegister(BaseReg, TRI)) - return E; - - // Early exit if the offset if not possible to match. (6 bits of positive - // range, plus allow an extra one in case we find a later insn that matches - // with Offset-1) int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1; - if (!(isNarrowLoad(Opc) || IsNarrowStore) && - !inBoundsForPair(IsUnscaled, Offset, OffsetStride)) - return E; + bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI); // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); + ModifiedRegs.reset(); + UsedRegs.reset(); // Remember any instructions that read/write memory between FirstMI and MI. SmallVector<MachineInstr *, 4> MemInsns; for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { - MachineInstr *MI = MBBI; + MachineInstr &MI = *MBBI; // Skip DBG_VALUE instructions. 
Otherwise debug info can affect the // optimization by changing how far we scan. - if (MI->isDebugValue()) + if (MI.isDebugValue()) continue; // Now that we know this is a real instruction, count it. ++Count; - bool CanMergeOpc = Opc == MI->getOpcode(); Flags.setSExtIdx(-1); - if (!CanMergeOpc) { - bool IsValidLdStrOpc; - unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc); - assert(IsValidLdStrOpc && - "Given Opc should be a Load or Store with an immediate"); - // Opc will be the first instruction in the pair. - Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0); - CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode()); - } - - if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) { - assert(MI->mayLoadOrStore() && "Expected memory operation."); + if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) && + getLdStOffsetOp(MI).isImm()) { + assert(MI.mayLoadOrStore() && "Expected memory operation."); // If we've found another instruction with the same opcode, check to see // if the base and offset are compatible with our starting instruction. // These instructions all have scaled immediate operands, so we just // check for +1/-1. Make sure to check the new instruction offset is // actually an immediate and not a symbolic reference destined for // a relocation. - // - // Pairwise instructions have a 7-bit signed offset field. Single insns - // have a 12-bit unsigned offset field. To be a valid combine, the - // final offset must be in range. unsigned MIBaseReg = getLdStBaseOp(MI).getReg(); int MIOffset = getLdStOffsetOp(MI).getImm(); + bool MIIsUnscaled = TII->isUnscaledLdSt(MI); + if (IsUnscaled != MIIsUnscaled) { + // We're trying to pair instructions that differ in how they are scaled. + // If FirstMI is scaled then scale the offset of MI accordingly. + // Otherwise, do the opposite (i.e., make MI's offset unscaled). + int MemSize = getMemScale(MI); + if (MIIsUnscaled) { + // If the unscaled offset isn't a multiple of the MemSize, we can't + // pair the operations together: bail and keep looking. + if (MIOffset % MemSize) + continue; + MIOffset /= MemSize; + } else { + MIOffset *= MemSize; + } + } + if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) || (Offset + OffsetStride == MIOffset))) { int MinOffset = Offset < MIOffset ? Offset : MIOffset; - // If this is a volatile load/store that otherwise matched, stop looking - // as something is going on that we don't have enough information to - // safely transform. Similarly, stop if we see a hint to avoid pairs. - if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) - return E; - // If the resultant immediate offset of merging these instructions - // is out of range for a pairwise instruction, bail and keep looking. - bool MIIsUnscaled = isUnscaledLdSt(MI); - bool IsNarrowLoad = isNarrowLoad(MI->getOpcode()); - if (!IsNarrowLoad && - !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) { - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - MemInsns.push_back(MI); - continue; - } - - if (IsNarrowLoad || IsNarrowStore) { + if (FindNarrowMerge) { // If the alignment requirements of the scaled wide load/store - // instruction can't express the offset of the scaled narrow - // input, bail and keep looking. - if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) { + // instruction can't express the offset of the scaled narrow input, + // bail and keep looking. For promotable zero stores, allow only when + // the stored value is the same (i.e., WZR). 
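// e.g. (illustrative): two scaled halfword zero stores at byte offsets #2
// and #4 have MinOffset == 1; alignTo(1, 2) != 1, so the 4-byte-scaled wide
// store that would replace them cannot encode the #2 byte offset, and we
// keep scanning.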
+ if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) || + (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - MemInsns.push_back(MI); + MemInsns.push_back(&MI); continue; } } else { + // Pairwise instructions have a 7-bit signed offset field. Single + // insns have a 12-bit unsigned offset field. If the resultant + // immediate offset of merging these instructions is out of range for + // a pairwise instruction, bail and keep looking. + if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + MemInsns.push_back(&MI); + continue; + } // If the alignment requirements of the paired (scaled) instruction // can't express the offset of the unscaled input, bail and keep // looking. if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - MemInsns.push_back(MI); + MemInsns.push_back(&MI); continue; } } // If the destination register of the loads is the same register, bail // and keep looking. A load-pair instruction with both destination // registers the same is UNPREDICTABLE and will result in an exception. - // For narrow stores, allow only when the stored value is the same - // (i.e., WZR). - if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) || - (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) { + if (MayLoad && Reg == getLdStRegOp(MI).getReg()) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - MemInsns.push_back(MI); + MemInsns.push_back(&MI); continue; } @@ -1194,7 +1313,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // and first alias with the second, we can combine the second into the // first. if (!ModifiedRegs[getLdStRegOp(MI).getReg()] && - !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) && + !(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) && !mayAlias(MI, MemInsns, TII)) { Flags.setMergeForward(false); return MBBI; @@ -1217,7 +1336,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // If the instruction wasn't a matching load or store. Stop searching if we // encounter a call instruction that might modify memory. - if (MI->isCall()) + if (MI.isCall()) return E; // Update modified / uses register lists. @@ -1229,8 +1348,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, return E; // Update list of instructions that read/write memory. - if (MI->mayLoadOrStore()) - MemInsns.push_back(MI); + if (MI.mayLoadOrStore()) + MemInsns.push_back(&MI); } return E; } @@ -1258,22 +1377,24 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode()) : getPostIndexedOpcode(I->getOpcode()); MachineInstrBuilder MIB; - if (!isPairedLdSt(I)) { + if (!isPairedLdSt(*I)) { // Non-paired instruction. MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(getLdStRegOp(Update)) - .addOperand(getLdStRegOp(I)) - .addOperand(getLdStBaseOp(I)) - .addImm(Value); + .addOperand(getLdStRegOp(*Update)) + .addOperand(getLdStRegOp(*I)) + .addOperand(getLdStBaseOp(*I)) + .addImm(Value) + .setMemRefs(I->memoperands_begin(), I->memoperands_end()); } else { // Paired instruction. 
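// e.g. (illustrative): merging "ldp x0, x1, [x2]" with a following
// "add x2, x2, #16" yields the post-indexed "ldp x0, x1, [x2], #16";
// the builder below encodes the writeback as Value / Scale == 16 / 8 == 2.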
- int Scale = getMemScale(I); + int Scale = getMemScale(*I); MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(getLdStRegOp(Update)) - .addOperand(getLdStRegOp(I, 0)) - .addOperand(getLdStRegOp(I, 1)) - .addOperand(getLdStBaseOp(I)) - .addImm(Value / Scale); + .addOperand(getLdStRegOp(*Update)) + .addOperand(getLdStRegOp(*I, 0)) + .addOperand(getLdStRegOp(*I, 1)) + .addOperand(getLdStBaseOp(*I)) + .addImm(Value / Scale) + .setMemRefs(I->memoperands_begin(), I->memoperands_end()); } (void)MIB; @@ -1296,10 +1417,10 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, return NextI; } -bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, - MachineInstr *MI, +bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI, + MachineInstr &MI, unsigned BaseReg, int Offset) { - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: break; case AArch64::SUBXri: @@ -1309,20 +1430,20 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, case AArch64::ADDXri: // Make sure it's a vanilla immediate operand, not a relocation or // anything else we can't handle. - if (!MI->getOperand(2).isImm()) + if (!MI.getOperand(2).isImm()) break; // Watch out for 1 << 12 shifted value. - if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm())) + if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm())) break; // The update instruction source and destination register must be the // same as the load/store base register. - if (MI->getOperand(0).getReg() != BaseReg || - MI->getOperand(1).getReg() != BaseReg) + if (MI.getOperand(0).getReg() != BaseReg || + MI.getOperand(1).getReg() != BaseReg) break; bool IsPairedInsn = isPairedLdSt(MemMI); - int UpdateOffset = MI->getOperand(2).getImm(); + int UpdateOffset = MI.getOperand(2).getImm(); // For non-paired load/store instructions, the immediate must fit in a // signed 9-bit integer. if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256)) @@ -1343,7 +1464,7 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, // If we have a non-zero Offset, we check that it matches the amount // we're adding to the register. - if (!Offset || Offset == MI->getOperand(2).getImm()) + if (!Offset || Offset == MI.getOperand(2).getImm()) return true; break; } @@ -1351,9 +1472,9 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, } MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( - MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) { + MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) { MachineBasicBlock::iterator E = I->getParent()->end(); - MachineInstr *MemMI = I; + MachineInstr &MemMI = *I; MachineBasicBlock::iterator MBBI = I; unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); @@ -1376,22 +1497,20 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); + ModifiedRegs.reset(); + UsedRegs.reset(); ++MBBI; - for (unsigned Count = 0; MBBI != E; ++MBBI) { - MachineInstr *MI = MBBI; - // Skip DBG_VALUE instructions. Otherwise debug info can affect the - // optimization by changing how far we scan. 
- if (MI->isDebugValue()) + for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { + MachineInstr &MI = *MBBI; + // Skip DBG_VALUE instructions. + if (MI.isDebugValue()) continue; // Now that we know this is a real instruction, count it. ++Count; // If we found a match, return it. - if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset)) + if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset)) return MBBI; // Update the status of what the instruction clobbered and used. @@ -1409,7 +1528,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( MachineBasicBlock::iterator I, unsigned Limit) { MachineBasicBlock::iterator B = I->getParent()->begin(); MachineBasicBlock::iterator E = I->getParent()->end(); - MachineInstr *MemMI = I; + MachineInstr &MemMI = *I; MachineBasicBlock::iterator MBBI = I; unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); @@ -1430,22 +1549,19 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); - --MBBI; - for (unsigned Count = 0; MBBI != B; --MBBI) { - MachineInstr *MI = MBBI; - // Skip DBG_VALUE instructions. Otherwise debug info can affect the - // optimization by changing how far we scan. - if (MI->isDebugValue()) - continue; + ModifiedRegs.reset(); + UsedRegs.reset(); + unsigned Count = 0; + do { + --MBBI; + MachineInstr &MI = *MBBI; - // Now that we know this is a real instruction, count it. - ++Count; + // Don't count DBG_VALUE instructions towards the search limit. + if (!MI.isDebugValue()) + ++Count; // If we found a match, return it. - if (isMatchingUpdateInsn(I, MI, BaseReg, Offset)) + if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) return MBBI; // Update the status of what the instruction clobbered and used. @@ -1455,15 +1571,15 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( // return early. if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) return E; - } + } while (MBBI != B && Count < Limit); return E; } bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( MachineBasicBlock::iterator &MBBI) { - MachineInstr *MI = MBBI; + MachineInstr &MI = *MBBI; // If this is a volatile load, don't mess with it. - if (MI->hasOrderedMemoryRef()) + if (MI.hasOrderedMemoryRef()) return false; // Make sure this is a reg+imm. @@ -1471,9 +1587,9 @@ bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( if (!getLdStOffsetOp(MI).isImm()) return false; - // Look backward up to ScanLimit instructions. + // Look backward up to LdStLimit instructions. MachineBasicBlock::iterator StoreI; - if (findMatchingStore(MBBI, ScanLimit, StoreI)) { + if (findMatchingStore(MBBI, LdStLimit, StoreI)) { ++NumLoadsFromStoresPromoted; // Promote the load. Keeping the iterator straight is a // pain, so we let the merge routine tell us what the next instruction @@ -1484,40 +1600,70 @@ bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( return false; } +// Find narrow loads that can be converted into a single wider load with +// bitfield extract instructions. Also merge adjacent zero stores into a wider +// store. bool AArch64LoadStoreOpt::tryToMergeLdStInst( MachineBasicBlock::iterator &MBBI) { - MachineInstr *MI = MBBI; - MachineBasicBlock::iterator E = MI->getParent()->end(); - // If this is a volatile load/store, don't mess with it. 
- if (MI->hasOrderedMemoryRef()) - return false; + assert((isNarrowLoad(*MBBI) || isPromotableZeroStoreOpcode(*MBBI)) && + "Expected narrow op."); + MachineInstr &MI = *MBBI; + MachineBasicBlock::iterator E = MI.getParent()->end(); - // Make sure this is a reg+imm (as opposed to an address reloc). - if (!getLdStOffsetOp(MI).isImm()) + if (!TII->isCandidateToMergeOrPair(MI)) return false; - // Check if this load/store has a hint to avoid pair formation. - // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. - if (TII->isLdStPairSuppressed(MI)) + // For promotable zero stores, the stored value should be WZR. + if (isPromotableZeroStoreOpcode(MI) && + getLdStRegOp(MI).getReg() != AArch64::WZR) return false; - // Look ahead up to ScanLimit instructions for a pairable instruction. + // Look ahead up to LdStLimit instructions for a mergable instruction. LdStPairFlags Flags; - MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit); - if (Paired != E) { + MachineBasicBlock::iterator MergeMI = + findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true); + if (MergeMI != E) { if (isNarrowLoad(MI)) { ++NumNarrowLoadsPromoted; - } else if (isNarrowStore(MI)) { + } else if (isPromotableZeroStoreInst(MI)) { ++NumZeroStoresPromoted; - } else { - ++NumPairCreated; - if (isUnscaledLdSt(MI)) - ++NumUnscaledPairCreated; } + // Keeping the iterator straight is a pain, so we let the merge routine tell + // us what the next instruction is after it's done mucking about. + MBBI = mergeNarrowInsns(MBBI, MergeMI, Flags); + return true; + } + return false; +} - // Merge the loads into a pair. Keeping the iterator straight is a - // pain, so we let the merge routine tell us what the next instruction - // is after it's done mucking about. +// Find loads and stores that can be merged into a single load or store pair +// instruction. +bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) { + MachineInstr &MI = *MBBI; + MachineBasicBlock::iterator E = MI.getParent()->end(); + + if (!TII->isCandidateToMergeOrPair(MI)) + return false; + + // Early exit if the offset is not possible to match. (6 bits of positive + // range, plus allow an extra one in case we find a later insn that matches + // with Offset-1) + bool IsUnscaled = TII->isUnscaledLdSt(MI); + int Offset = getLdStOffsetOp(MI).getImm(); + int OffsetStride = IsUnscaled ? getMemScale(MI) : 1; + if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride)) + return false; + + // Look ahead up to LdStLimit instructions for a pairable instruction. + LdStPairFlags Flags; + MachineBasicBlock::iterator Paired = + findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false); + if (Paired != E) { + ++NumPairCreated; + if (TII->isUnscaledLdSt(MI)) + ++NumUnscaledPairCreated; + // Keeping the iterator straight is a pain, so we let the merge routine tell + // us what the next instruction is after it's done mucking about. MBBI = mergePairedInsns(MBBI, Paired, Flags); return true; } @@ -1527,7 +1673,7 @@ bool AArch64LoadStoreOpt::tryToMergeLdStInst( bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt) { bool Modified = false; - // Three tranformations to do here: + // Four tranformations to do here: // 1) Find loads that directly read from stores and promote them by // replacing with mov instructions. If the store is wider than the load, // the load will be replaced with a bitfield extract. 
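As an aside on transformation (1): the bitfield-extract parameters that promoteLoadFromStore derives (Immr/Imms in the hunk further up) follow directly from the byte offsets and access sizes. The minimal standalone sketch below reruns that arithmetic for the str/ldrh case quoted in the surrounding comment; register names and offsets are illustrative only.

  // Sketch of the bitfield-extract parameters computed when a narrow load is
  // promoted from a wider store ("str w1, [x0, #4]" followed by
  // "ldrh w2, [x0, #6]").
  #include <cassert>
  #include <cstdio>

  int main() {
    const int LoadSize = 2;          // ldrh reads 2 bytes.
    const int StoreSize = 4;         // str wN writes 4 bytes.
    const int UnscaledLdOffset = 6;  // byte offset of the load.
    const int UnscaledStOffset = 4;  // byte offset of the store.
    assert(UnscaledLdOffset >= UnscaledStOffset &&
           UnscaledLdOffset + LoadSize <= UnscaledStOffset + StoreSize &&
           "the load must read a subrange of the stored bytes");
    const int Width = LoadSize * 8;                             // 16 bits.
    const int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); // lsb = 16.
    const int Imms = Immr + Width - 1;                          // msb = 31.
    // ubfm w2, w1, #16, #31 is the canonical form of "lsr w2, w1, #16",
    // which is what the pass emits in place of the narrow load.
    std::printf("ubfm w2, w1, #%d, #%d\n", Immr, Imms);
    return 0;
  }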
@@ -1536,35 +1682,11 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, // ldrh w2, [x0, #6] // ; becomes // str w1, [x0, #4] - // lsr w2, w1, #16 - // 2) Find narrow loads that can be converted into a single wider load - // with bitfield extract instructions. - // e.g., - // ldrh w0, [x2] - // ldrh w1, [x2, #2] - // ; becomes - // ldr w0, [x2] - // ubfx w1, w0, #16, #16 - // and w0, w0, #ffff - // 3) Find loads and stores that can be merged into a single load or store - // pair instruction. - // e.g., - // ldr x0, [x2] - // ldr x1, [x2, #8] - // ; becomes - // ldp x0, x1, [x2] - // 4) Find base register updates that can be merged into the load or store - // as a base-reg writeback. - // e.g., - // ldr x0, [x2] - // add x2, x2, #4 - // ; becomes - // ldr x0, [x2], #4 - + // lsr w2, w1, #16 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) { - MachineInstr *MI = MBBI; - switch (MI->getOpcode()) { + MachineInstr &MI = *MBBI; + switch (MI.getOpcode()) { default: // Just move on to the next instruction. ++MBBI; @@ -1586,47 +1708,49 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, ++MBBI; break; } - // FIXME: Do the other instructions. } } - + // 2) Find narrow loads that can be converted into a single wider load + // with bitfield extract instructions. + // e.g., + // ldrh w0, [x2] + // ldrh w1, [x2, #2] + // ; becomes + // ldr w0, [x2] + // ubfx w1, w0, #16, #16 + // and w0, w0, #ffff + // + // Also merge adjacent zero stores into a wider store. + // e.g., + // strh wzr, [x0] + // strh wzr, [x0, #2] + // ; becomes + // str wzr, [x0] for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); enableNarrowLdOpt && MBBI != E;) { - MachineInstr *MI = MBBI; - switch (MI->getOpcode()) { - default: - // Just move on to the next instruction. - ++MBBI; - break; - // Scaled instructions. - case AArch64::LDRBBui: - case AArch64::LDRHHui: - case AArch64::LDRSBWui: - case AArch64::LDRSHWui: - case AArch64::STRBBui: - case AArch64::STRHHui: - // Unscaled instructions. - case AArch64::LDURBBi: - case AArch64::LDURHHi: - case AArch64::LDURSBWi: - case AArch64::LDURSHWi: - case AArch64::STURBBi: - case AArch64::STURHHi: { + MachineInstr &MI = *MBBI; + unsigned Opc = MI.getOpcode(); + if (isPromotableZeroStoreOpcode(Opc) || + (EnableNarrowLdMerge && isNarrowLoad(Opc))) { if (tryToMergeLdStInst(MBBI)) { Modified = true; - break; - } + } else + ++MBBI; + } else ++MBBI; - break; - } - // FIXME: Do the other instructions. - } } + // 3) Find loads and stores that can be merged into a single load or store + // pair instruction. + // e.g., + // ldr x0, [x2] + // ldr x1, [x2, #8] + // ; becomes + // ldp x0, x1, [x2] for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) { - MachineInstr *MI = MBBI; - switch (MI->getOpcode()) { + MachineInstr &MI = *MBBI; + switch (MI.getOpcode()) { default: // Just move on to the next instruction. ++MBBI; @@ -1655,23 +1779,28 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, case AArch64::LDURWi: case AArch64::LDURXi: case AArch64::LDURSWi: { - if (tryToMergeLdStInst(MBBI)) { + if (tryToPairLdStInst(MBBI)) { Modified = true; break; } ++MBBI; break; } - // FIXME: Do the other instructions. } } - + // 4) Find base register updates that can be merged into the load or store + // as a base-reg writeback. 
+ // e.g., + // ldr x0, [x2] + // add x2, x2, #4 + // ; becomes + // ldr x0, [x2], #4 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) { - MachineInstr *MI = MBBI; + MachineInstr &MI = *MBBI; // Do update merging. It's simpler to keep this separate from the above - // switch, though not strictly necessary. - unsigned Opc = MI->getOpcode(); + // switchs, though not strictly necessary. + unsigned Opc = MI.getOpcode(); switch (Opc) { default: // Just move on to the next instruction. @@ -1726,7 +1855,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, // merged into: // ldr x0, [x20], #32 MachineBasicBlock::iterator Update = - findMatchingUpdateInsnForward(MBBI, ScanLimit, 0); + findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit); if (Update != E) { // Merge the update into the ld/st. MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false); @@ -1736,7 +1865,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, } // Don't know how to handle pre/post-index versions, so move to the next // instruction. - if (isUnscaledLdSt(Opc)) { + if (TII->isUnscaledLdSt(Opc)) { ++MBBI; break; } @@ -1746,7 +1875,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, // ldr x1, [x0] // merged into: // ldr x1, [x0, #8]! - Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit); + Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit); if (Update != E) { // Merge the update into the ld/st. MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); @@ -1764,7 +1893,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, // add x0, x0, #64 // merged into: // ldr x1, [x0, #64]! - Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset); + Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit); if (Update != E) { // Merge the update into the ld/st. MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); @@ -1777,29 +1906,29 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, ++MBBI; break; } - // FIXME: Do the other instructions. } } return Modified; } -bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) { - bool ProfitableArch = Subtarget->isCortexA57(); - // FIXME: The benefit from converting narrow loads into a wider load could be - // microarchitectural as it assumes that a single load with two bitfield - // extracts is cheaper than two narrow loads. Currently, this conversion is - // enabled only in cortex-a57 on which performance benefits were verified. - return ProfitableArch && !Subtarget->requiresStrictAlign(); -} - bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { + if (skipFunction(*Fn.getFunction())) + return false; + Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget()); TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo()); TRI = Subtarget->getRegisterInfo(); + // Resize the modified and used register bitfield trackers. We do this once + // per function and then clear the bitfield each time we optimize a load or + // store. 
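// (BitVector::reset() clears the bits without releasing the storage, so the
// per-candidate reset() calls in the scan routines stay cheap once the
// trackers have been sized here.)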
+ ModifiedRegs.resize(TRI->getNumRegs()); + UsedRegs.resize(TRI->getNumRegs()); + bool Modified = false; - bool enableNarrowLdOpt = enableNarrowLdMerge(Fn); + bool enableNarrowLdOpt = + Subtarget->mergeNarrowLoads() && !Subtarget->requiresStrictAlign(); for (auto &MBB : Fn) Modified |= optimizeBlock(MBB, enableNarrowLdOpt); @@ -1809,6 +1938,11 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep // loads and stores near one another? +// FIXME: When pairing store instructions it's very possible for this pass to +// hoist a store with a KILL marker above another use (without a KILL marker). +// The resulting IR is invalid, but nothing uses the KILL markers after this +// pass, so it's never caused a problem in practice. + /// createAArch64LoadStoreOptimizationPass - returns an instance of the /// load / store optimization pass. FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() { diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 318f839535053..49e7767741eaa 100644 --- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -48,6 +48,9 @@ class AArch64FunctionInfo : public MachineFunctionInfo { /// \brief Amount of stack frame size, not including callee-saved registers. unsigned LocalStackSize; + /// \brief Amount of stack frame size used for saving callee-saved registers. + unsigned CalleeSavedStackSize; + /// \brief Number of TLS accesses using the special (combinable) /// _TLS_MODULE_BASE_ symbol. unsigned NumLocalDynamicTLSAccesses; @@ -76,18 +79,28 @@ class AArch64FunctionInfo : public MachineFunctionInfo { /// copies. bool IsSplitCSR; + /// True when the stack gets realigned dynamically because the size of stack + /// frame is unknown at compile time. e.g., in case of VLAs. + bool StackRealigned; + + /// True when the callee-save stack area has unused gaps that may be used for + /// other stack allocations. 
+ bool CalleeSaveStackHasFreeSpace; + public: AArch64FunctionInfo() : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), - IsSplitCSR(false) {} + IsSplitCSR(false), StackRealigned(false), + CalleeSaveStackHasFreeSpace(false) {} explicit AArch64FunctionInfo(MachineFunction &MF) : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), - IsSplitCSR(false) { + IsSplitCSR(false), StackRealigned(false), + CalleeSaveStackHasFreeSpace(false) { (void)MF; } @@ -102,12 +115,25 @@ public: bool hasStackFrame() const { return HasStackFrame; } void setHasStackFrame(bool s) { HasStackFrame = s; } + bool isStackRealigned() const { return StackRealigned; } + void setStackRealigned(bool s) { StackRealigned = s; } + + bool hasCalleeSaveStackFreeSpace() const { + return CalleeSaveStackHasFreeSpace; + } + void setCalleeSaveStackHasFreeSpace(bool s) { + CalleeSaveStackHasFreeSpace = s; + } + bool isSplitCSR() const { return IsSplitCSR; } void setIsSplitCSR(bool s) { IsSplitCSR = s; } void setLocalStackSize(unsigned Size) { LocalStackSize = Size; } unsigned getLocalStackSize() const { return LocalStackSize; } + void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; } + unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; } + void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; } unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamicTLSAccesses; @@ -140,15 +166,15 @@ public: SmallVector<const MachineInstr *, 3> Args; public: - typedef SmallVectorImpl<const MachineInstr *> LOHArgs; + typedef ArrayRef<const MachineInstr *> LOHArgs; - MILOHDirective(MCLOHType Kind, const LOHArgs &Args) + MILOHDirective(MCLOHType Kind, LOHArgs Args) : Kind(Kind), Args(Args.begin(), Args.end()) { assert(isValidMCLOHType(Kind) && "Invalid LOH directive type!"); } MCLOHType getKind() const { return Kind; } - const LOHArgs &getArgs() const { return Args; } + LOHArgs getArgs() const { return Args; } }; typedef MILOHDirective::LOHArgs MILOHArgs; @@ -157,7 +183,7 @@ public: const MILOHContainer &getLOHContainer() const { return LOHContainerSet; } /// Add a LOH directive of this @p Kind and this @p Args. 
- void addLOHDirective(MCLOHType Kind, const MILOHArgs &Args) { + void addLOHDirective(MCLOHType Kind, MILOHArgs Args) { LOHContainerSet.push_back(MILOHDirective(Kind, Args)); LOHRelated.insert(Args.begin(), Args.end()); } diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp index 5394875a6bc12..038162c6f54a9 100644 --- a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp +++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp @@ -320,7 +320,7 @@ void A57ChainingConstraint::addInterChainConstraint(PBQPRAGraph &G, unsigned Rd, static bool regJustKilledBefore(const LiveIntervals &LIs, unsigned reg, const MachineInstr &MI) { const LiveInterval &LI = LIs.getInterval(reg); - SlotIndex SI = LIs.getInstructionIndex(&MI); + SlotIndex SI = LIs.getInstructionIndex(MI); return LI.expiredAt(SI); } diff --git a/lib/Target/AArch64/AArch64PromoteConstant.cpp b/lib/Target/AArch64/AArch64PromoteConstant.cpp index 79c09d9f058d6..b1e40510b2ae7 100644 --- a/lib/Target/AArch64/AArch64PromoteConstant.cpp +++ b/lib/Target/AArch64/AArch64PromoteConstant.cpp @@ -85,6 +85,21 @@ namespace { class AArch64PromoteConstant : public ModulePass { public: + struct PromotedConstant { + bool ShouldConvert = false; + GlobalVariable *GV = nullptr; + }; + typedef SmallDenseMap<Constant *, PromotedConstant, 16> PromotionCacheTy; + + struct UpdateRecord { + Constant *C; + Instruction *User; + unsigned Op; + + UpdateRecord(Constant *C, Instruction *User, unsigned Op) + : C(C), User(User), Op(Op) {} + }; + static char ID; AArch64PromoteConstant() : ModulePass(ID) {} @@ -94,9 +109,12 @@ public: /// global variables with module scope. bool runOnModule(Module &M) override { DEBUG(dbgs() << getPassName() << '\n'); + if (skipModule(M)) + return false; bool Changed = false; + PromotionCacheTy PromotionCache; for (auto &MF : M) { - Changed |= runOnFunction(MF); + Changed |= runOnFunction(MF, PromotionCache); } return Changed; } @@ -105,7 +123,7 @@ private: /// Look for interesting constants used within the given function. /// Promote them into global variables, load these global variables within /// the related function, so that the number of inserted load is minimal. - bool runOnFunction(Function &F); + bool runOnFunction(Function &F, PromotionCacheTy &PromotionCache); // This transformation requires dominator info void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -115,79 +133,72 @@ private: } /// Type to store a list of Uses. - typedef SmallVector<Use *, 4> Uses; + typedef SmallVector<std::pair<Instruction *, unsigned>, 4> Uses; /// Map an insertion point to all the uses it dominates. typedef DenseMap<Instruction *, Uses> InsertionPoints; - /// Map a function to the required insertion point of load for a - /// global variable. - typedef DenseMap<Function *, InsertionPoints> InsertionPointsPerFunc; /// Find the closest point that dominates the given Use. - Instruction *findInsertionPoint(Use &Use); + Instruction *findInsertionPoint(Instruction &User, unsigned OpNo); /// Check if the given insertion point is dominated by an existing /// insertion point. /// If true, the given use is added to the list of dominated uses for /// the related existing point. 
/// \param NewPt the insertion point to be checked - /// \param Use the use to be added into the list of dominated uses + /// \param User the user of the constant + /// \param OpNo the operand number of the use /// \param InsertPts existing insertion points /// \pre NewPt and all instruction in InsertPts belong to the same function /// \return true if one of the insertion point in InsertPts dominates NewPt, /// false otherwise - bool isDominated(Instruction *NewPt, Use &Use, InsertionPoints &InsertPts); + bool isDominated(Instruction *NewPt, Instruction *User, unsigned OpNo, + InsertionPoints &InsertPts); /// Check if the given insertion point can be merged with an existing /// insertion point in a common dominator. /// If true, the given use is added to the list of the created insertion /// point. /// \param NewPt the insertion point to be checked - /// \param Use the use to be added into the list of dominated uses + /// \param User the user of the constant + /// \param OpNo the operand number of the use /// \param InsertPts existing insertion points /// \pre NewPt and all instruction in InsertPts belong to the same function /// \pre isDominated returns false for the exact same parameters. /// \return true if it exists an insertion point in InsertPts that could /// have been merged with NewPt in a common dominator, /// false otherwise - bool tryAndMerge(Instruction *NewPt, Use &Use, InsertionPoints &InsertPts); + bool tryAndMerge(Instruction *NewPt, Instruction *User, unsigned OpNo, + InsertionPoints &InsertPts); /// Compute the minimal insertion points to dominates all the interesting /// uses of value. /// Insertion points are group per function and each insertion point /// contains a list of all the uses it dominates within the related function - /// \param Val constant to be examined - /// \param[out] InsPtsPerFunc output storage of the analysis - void computeInsertionPoints(Constant *Val, - InsertionPointsPerFunc &InsPtsPerFunc); + /// \param User the user of the constant + /// \param OpNo the operand number of the constant + /// \param[out] InsertPts output storage of the analysis + void computeInsertionPoint(Instruction *User, unsigned OpNo, + InsertionPoints &InsertPts); /// Insert a definition of a new global variable at each point contained in /// InsPtsPerFunc and update the related uses (also contained in /// InsPtsPerFunc). - bool insertDefinitions(Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc); - - /// Compute the minimal insertion points to dominate all the interesting - /// uses of Val and insert a definition of a new global variable - /// at these points. - /// Also update the uses of Val accordingly. - /// Currently a use of Val is considered interesting if: - /// - Val is not UndefValue - /// - Val is not zeroinitialized - /// - Replacing Val per a load of a global variable is valid. - /// \see shouldConvert for more details - bool computeAndInsertDefinitions(Constant *Val); - - /// Promote the given constant into a global variable if it is expected to - /// be profitable. - /// \return true if Cst has been promoted - bool promoteConstant(Constant *Cst); + void insertDefinitions(Function &F, GlobalVariable &GV, + InsertionPoints &InsertPts); + + /// Do the constant promotion indicated by the Updates records, keeping track + /// of globals in PromotionCache. + void promoteConstants(Function &F, SmallVectorImpl<UpdateRecord> &Updates, + PromotionCacheTy &PromotionCache); /// Transfer the list of dominated uses of IPI to NewPt in InsertPts. 
/// Append Use to this list and delete the entry of IPI in InsertPts. - static void appendAndTransferDominatedUses(Instruction *NewPt, Use &Use, + static void appendAndTransferDominatedUses(Instruction *NewPt, + Instruction *User, unsigned OpNo, InsertionPoints::iterator &IPI, InsertionPoints &InsertPts) { // Record the dominated use. - IPI->second.push_back(&Use); + IPI->second.emplace_back(User, OpNo); // Transfer the dominated uses of IPI to NewPt // Inserting into the DenseMap may invalidate existing iterator. // Keep a copy of the key to find the iterator to erase. Keep a copy of the @@ -285,10 +296,7 @@ static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr, // Do not mess with inline asm. const CallInst *CI = dyn_cast<const CallInst>(Instr); - if (CI && isa<const InlineAsm>(CI->getCalledValue())) - return false; - - return true; + return !(CI && isa<const InlineAsm>(CI->getCalledValue())); } /// Check if the given Cst should be converted into @@ -305,7 +313,7 @@ static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr, /// for the regular approach, even for float). /// Again, the simplest solution would be to promote every /// constant and rematerialize them when they are actually cheap to create. -static bool shouldConvert(const Constant *Cst) { +static bool shouldConvertImpl(const Constant *Cst) { if (isa<const UndefValue>(Cst)) return false; @@ -328,18 +336,28 @@ static bool shouldConvert(const Constant *Cst) { return isConstantUsingVectorTy(Cst->getType()); } -Instruction *AArch64PromoteConstant::findInsertionPoint(Use &Use) { - Instruction *User = cast<Instruction>(Use.getUser()); +static bool +shouldConvert(Constant &C, + AArch64PromoteConstant::PromotionCacheTy &PromotionCache) { + auto Converted = PromotionCache.insert( + std::make_pair(&C, AArch64PromoteConstant::PromotedConstant())); + if (Converted.second) + Converted.first->second.ShouldConvert = shouldConvertImpl(&C); + return Converted.first->second.ShouldConvert; +} +Instruction *AArch64PromoteConstant::findInsertionPoint(Instruction &User, + unsigned OpNo) { // If this user is a phi, the insertion point is in the related // incoming basic block. 
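// (The load we later insert must dominate this use; a phi operand is
// evaluated on the incoming edge, so the terminator of the incoming block is
// the latest point that still dominates it.)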
- if (PHINode *PhiInst = dyn_cast<PHINode>(User)) - return PhiInst->getIncomingBlock(Use.getOperandNo())->getTerminator(); + if (PHINode *PhiInst = dyn_cast<PHINode>(&User)) + return PhiInst->getIncomingBlock(OpNo)->getTerminator(); - return User; + return &User; } -bool AArch64PromoteConstant::isDominated(Instruction *NewPt, Use &Use, +bool AArch64PromoteConstant::isDominated(Instruction *NewPt, Instruction *User, + unsigned OpNo, InsertionPoints &InsertPts) { DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>( @@ -358,14 +376,15 @@ bool AArch64PromoteConstant::isDominated(Instruction *NewPt, Use &Use, DEBUG(dbgs() << "Insertion point dominated by:\n"); DEBUG(IPI.first->print(dbgs())); DEBUG(dbgs() << '\n'); - IPI.second.push_back(&Use); + IPI.second.emplace_back(User, OpNo); return true; } } return false; } -bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Use &Use, +bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Instruction *User, + unsigned OpNo, InsertionPoints &InsertPts) { DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>( *NewPt->getParent()->getParent()).getDomTree(); @@ -385,7 +404,7 @@ bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Use &Use, DEBUG(dbgs() << "Merge insertion point with:\n"); DEBUG(IPI->first->print(dbgs())); DEBUG(dbgs() << "\nat considered insertion point.\n"); - appendAndTransferDominatedUses(NewPt, Use, IPI, InsertPts); + appendAndTransferDominatedUses(NewPt, User, OpNo, IPI, InsertPts); return true; } @@ -409,149 +428,141 @@ bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Use &Use, DEBUG(dbgs() << '\n'); DEBUG(NewPt->print(dbgs())); DEBUG(dbgs() << '\n'); - appendAndTransferDominatedUses(NewPt, Use, IPI, InsertPts); + appendAndTransferDominatedUses(NewPt, User, OpNo, IPI, InsertPts); return true; } return false; } -void AArch64PromoteConstant::computeInsertionPoints( - Constant *Val, InsertionPointsPerFunc &InsPtsPerFunc) { - DEBUG(dbgs() << "** Compute insertion points **\n"); - for (Use &Use : Val->uses()) { - Instruction *User = dyn_cast<Instruction>(Use.getUser()); - - // If the user is not an Instruction, we cannot modify it. - if (!User) - continue; - - // Filter out uses that should not be converted. - if (!shouldConvertUse(Val, User, Use.getOperandNo())) - continue; +void AArch64PromoteConstant::computeInsertionPoint( + Instruction *User, unsigned OpNo, InsertionPoints &InsertPts) { + DEBUG(dbgs() << "Considered use, opidx " << OpNo << ":\n"); + DEBUG(User->print(dbgs())); + DEBUG(dbgs() << '\n'); - DEBUG(dbgs() << "Considered use, opidx " << Use.getOperandNo() << ":\n"); - DEBUG(User->print(dbgs())); - DEBUG(dbgs() << '\n'); + Instruction *InsertionPoint = findInsertionPoint(*User, OpNo); - Instruction *InsertionPoint = findInsertionPoint(Use); + DEBUG(dbgs() << "Considered insertion point:\n"); + DEBUG(InsertionPoint->print(dbgs())); + DEBUG(dbgs() << '\n'); - DEBUG(dbgs() << "Considered insertion point:\n"); - DEBUG(InsertionPoint->print(dbgs())); - DEBUG(dbgs() << '\n'); + if (isDominated(InsertionPoint, User, OpNo, InsertPts)) + return; + // This insertion point is useful, check if we can merge some insertion + // point in a common dominator or if NewPt dominates an existing one. + if (tryAndMerge(InsertionPoint, User, OpNo, InsertPts)) + return; - // Check if the current insertion point is useless, i.e., it is dominated - // by another one. 
- InsertionPoints &InsertPts = - InsPtsPerFunc[InsertionPoint->getParent()->getParent()]; - if (isDominated(InsertionPoint, Use, InsertPts)) - continue; - // This insertion point is useful, check if we can merge some insertion - // point in a common dominator or if NewPt dominates an existing one. - if (tryAndMerge(InsertionPoint, Use, InsertPts)) - continue; - - DEBUG(dbgs() << "Keep considered insertion point\n"); + DEBUG(dbgs() << "Keep considered insertion point\n"); - // It is definitely useful by its own - InsertPts[InsertionPoint].push_back(&Use); - } + // It is definitely useful by its own + InsertPts[InsertionPoint].emplace_back(User, OpNo); } -bool AArch64PromoteConstant::insertDefinitions( - Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc) { - // We will create one global variable per Module. - DenseMap<Module *, GlobalVariable *> ModuleToMergedGV; - bool HasChanged = false; +static void ensurePromotedGV(Function &F, Constant &C, + AArch64PromoteConstant::PromotedConstant &PC) { + assert(PC.ShouldConvert && + "Expected that we should convert this to a global"); + if (PC.GV) + return; + PC.GV = new GlobalVariable( + *F.getParent(), C.getType(), true, GlobalValue::InternalLinkage, nullptr, + "_PromotedConst", nullptr, GlobalVariable::NotThreadLocal); + PC.GV->setInitializer(&C); + DEBUG(dbgs() << "Global replacement: "); + DEBUG(PC.GV->print(dbgs())); + DEBUG(dbgs() << '\n'); + ++NumPromoted; +} - // Traverse all insertion points in all the function. - for (const auto &FctToInstPtsIt : InsPtsPerFunc) { - const InsertionPoints &InsertPts = FctToInstPtsIt.second; -// Do more checking for debug purposes. +void AArch64PromoteConstant::insertDefinitions(Function &F, + GlobalVariable &PromotedGV, + InsertionPoints &InsertPts) { #ifndef NDEBUG - DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>( - *FctToInstPtsIt.first).getDomTree(); + // Do more checking for debug purposes. + DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); #endif - assert(!InsertPts.empty() && "Empty uses does not need a definition"); - - Module *M = FctToInstPtsIt.first->getParent(); - GlobalVariable *&PromotedGV = ModuleToMergedGV[M]; - if (!PromotedGV) { - PromotedGV = new GlobalVariable( - *M, Cst->getType(), true, GlobalValue::InternalLinkage, nullptr, - "_PromotedConst", nullptr, GlobalVariable::NotThreadLocal); - PromotedGV->setInitializer(Cst); - DEBUG(dbgs() << "Global replacement: "); - DEBUG(PromotedGV->print(dbgs())); - DEBUG(dbgs() << '\n'); - ++NumPromoted; - HasChanged = true; - } - - for (const auto &IPI : InsertPts) { - // Create the load of the global variable. - IRBuilder<> Builder(IPI.first); - LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV); - DEBUG(dbgs() << "**********\n"); - DEBUG(dbgs() << "New def: "); - DEBUG(LoadedCst->print(dbgs())); - DEBUG(dbgs() << '\n'); + assert(!InsertPts.empty() && "Empty uses does not need a definition"); + + for (const auto &IPI : InsertPts) { + // Create the load of the global variable. + IRBuilder<> Builder(IPI.first); + LoadInst *LoadedCst = Builder.CreateLoad(&PromotedGV); + DEBUG(dbgs() << "**********\n"); + DEBUG(dbgs() << "New def: "); + DEBUG(LoadedCst->print(dbgs())); + DEBUG(dbgs() << '\n'); - // Update the dominated uses. - for (Use *Use : IPI.second) { + // Update the dominated uses. 
+ for (auto Use : IPI.second) { #ifndef NDEBUG - assert(DT.dominates(LoadedCst, findInsertionPoint(*Use)) && - "Inserted definition does not dominate all its uses!"); + assert(DT.dominates(LoadedCst, + findInsertionPoint(*Use.first, Use.second)) && + "Inserted definition does not dominate all its uses!"); #endif - DEBUG(dbgs() << "Use to update " << Use->getOperandNo() << ":"); - DEBUG(Use->getUser()->print(dbgs())); - DEBUG(dbgs() << '\n'); - Use->set(LoadedCst); - ++NumPromotedUses; - } + DEBUG({ + dbgs() << "Use to update " << Use.second << ":"; + Use.first->print(dbgs()); + dbgs() << '\n'; + }); + Use.first->setOperand(Use.second, LoadedCst); + ++NumPromotedUses; } } - return HasChanged; } -bool AArch64PromoteConstant::computeAndInsertDefinitions(Constant *Val) { - InsertionPointsPerFunc InsertPtsPerFunc; - computeInsertionPoints(Val, InsertPtsPerFunc); - return insertDefinitions(Val, InsertPtsPerFunc); -} - -bool AArch64PromoteConstant::promoteConstant(Constant *Cst) { - assert(Cst && "Given variable is not a valid constant."); - - if (!shouldConvert(Cst)) - return false; - - DEBUG(dbgs() << "******************************\n"); - DEBUG(dbgs() << "Candidate constant: "); - DEBUG(Cst->print(dbgs())); - DEBUG(dbgs() << '\n'); - - return computeAndInsertDefinitions(Cst); +void AArch64PromoteConstant::promoteConstants( + Function &F, SmallVectorImpl<UpdateRecord> &Updates, + PromotionCacheTy &PromotionCache) { + // Promote the constants. + for (auto U = Updates.begin(), E = Updates.end(); U != E;) { + DEBUG(dbgs() << "** Compute insertion points **\n"); + auto First = U; + Constant *C = First->C; + InsertionPoints InsertPts; + do { + computeInsertionPoint(U->User, U->Op, InsertPts); + } while (++U != E && U->C == C); + + auto &Promotion = PromotionCache[C]; + ensurePromotedGV(F, *C, Promotion); + insertDefinitions(F, *Promotion.GV, InsertPts); + } } -bool AArch64PromoteConstant::runOnFunction(Function &F) { +bool AArch64PromoteConstant::runOnFunction(Function &F, + PromotionCacheTy &PromotionCache) { // Look for instructions using constant vector. Promote that constant to a // global variable. Create as few loads of this variable as possible and // update the uses accordingly. - bool LocalChange = false; - SmallPtrSet<Constant *, 8> AlreadyChecked; - + SmallVector<UpdateRecord, 64> Updates; for (Instruction &I : instructions(&F)) { // Traverse the operand, looking for constant vectors. Replace them by a // load of a global variable of constant vector type. - for (Value *Op : I.operand_values()) { - Constant *Cst = dyn_cast<Constant>(Op); + for (Use &U : I.operands()) { + Constant *Cst = dyn_cast<Constant>(U); // There is no point in promoting global values as they are already // global. Do not promote constant expressions either, as they may // require some code expansion. - if (Cst && !isa<GlobalValue>(Cst) && !isa<ConstantExpr>(Cst) && - AlreadyChecked.insert(Cst).second) - LocalChange |= promoteConstant(Cst); + if (!Cst || isa<GlobalValue>(Cst) || isa<ConstantExpr>(Cst)) + continue; + + // Check if this constant is worth promoting. + if (!shouldConvert(*Cst, PromotionCache)) + continue; + + // Check if this use should be promoted. 
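// e.g. (illustrative): a constant vector used as an inline-asm argument is
// rejected here by shouldConvertUse ("Do not mess with inline asm" above)
// and never recorded for promotion.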
+ unsigned OpNo = &U - I.op_begin(); + if (!shouldConvertUse(Cst, &I, OpNo)) + continue; + + Updates.emplace_back(Cst, &I, OpNo); } } - return LocalChange; + + if (Updates.empty()) + return false; + + promoteConstants(F, Updates, PromotionCache); + return true; } diff --git a/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp new file mode 100644 index 0000000000000..60d8bbd260bb7 --- /dev/null +++ b/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp @@ -0,0 +1,182 @@ +//=- AArch64RedundantCopyElimination.cpp - Remove useless copy for AArch64 -=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// This pass removes unnecessary zero copies in BBs that are targets of +// cbz/cbnz instructions. For instance, the copy instruction in the code below +// can be removed because the CBZW jumps to BB#2 when W0 is zero. +// BB#1: +// CBZW %W0, <BB#2> +// BB#2: +// %W0 = COPY %WZR +// This pass should be run after register allocation. +// +// FIXME: This should be extended to handle any constant other than zero. E.g., +// cmp w0, #1 +// b.eq .BB1 +// BB1: +// mov w0, #1 +// +// FIXME: This could also be extended to check the whole dominance subtree below +// the comparison if the compile time regression is acceptable. +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-copyelim" + +STATISTIC(NumCopiesRemoved, "Number of copies removed."); + +namespace llvm { +void initializeAArch64RedundantCopyEliminationPass(PassRegistry &); +} + +namespace { +class AArch64RedundantCopyElimination : public MachineFunctionPass { + const MachineRegisterInfo *MRI; + const TargetRegisterInfo *TRI; + +public: + static char ID; + AArch64RedundantCopyElimination() : MachineFunctionPass(ID) {} + bool optimizeCopy(MachineBasicBlock *MBB); + bool runOnMachineFunction(MachineFunction &MF) override; + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::AllVRegsAllocated); + } + const char *getPassName() const override { + return "AArch64 Redundant Copy Elimination"; + } +}; +char AArch64RedundantCopyElimination::ID = 0; +} + +INITIALIZE_PASS(AArch64RedundantCopyElimination, "aarch64-copyelim", + "AArch64 redundant copy elimination pass", false, false) + +static bool guaranteesZeroRegInBlock(MachineInstr &MI, MachineBasicBlock *MBB) { + unsigned Opc = MI.getOpcode(); + // Check if the current basic block is the target block to which the + // CBZ/CBNZ instruction jumps when its Wt/Xt is zero. + if ((Opc == AArch64::CBZW || Opc == AArch64::CBZX) && + MBB == MI.getOperand(1).getMBB()) + return true; + else if ((Opc == AArch64::CBNZW || Opc == AArch64::CBNZX) && + MBB != MI.getOperand(1).getMBB()) + return true; + + return false; +} + +bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) { + // Check if the current basic block has a single predecessor. 
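  // A unique predecessor is required so that the zero guarantee established on
  // the CBZ/CBNZ edge cannot be defeated by another incoming path. The CBNZ
  // case (handled by guaranteesZeroRegInBlock above) is the fall-through
  // successor, e.g. (illustrative sketch in the notation of the file header):
  //   BB#0:
  //     CBNZW %W0, <BB#2>
  //   BB#1:                  ; fall-through, so %W0 is known to be zero here
  //     %W0 = COPY %WZR      ; removable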
+ if (MBB->pred_size() != 1) + return false; + + MachineBasicBlock *PredMBB = *MBB->pred_begin(); + MachineBasicBlock::iterator CompBr = PredMBB->getLastNonDebugInstr(); + if (CompBr == PredMBB->end() || PredMBB->succ_size() != 2) + return false; + + ++CompBr; + do { + --CompBr; + if (guaranteesZeroRegInBlock(*CompBr, MBB)) + break; + } while (CompBr != PredMBB->begin() && CompBr->isTerminator()); + + // We've not found a CBZ/CBNZ, time to bail out. + if (!guaranteesZeroRegInBlock(*CompBr, MBB)) + return false; + + unsigned TargetReg = CompBr->getOperand(0).getReg(); + if (!TargetReg) + return false; + assert(TargetRegisterInfo::isPhysicalRegister(TargetReg) && + "Expect physical register"); + + // Remember all registers aliasing with TargetReg. + SmallSetVector<unsigned, 8> TargetRegs; + for (MCRegAliasIterator AI(TargetReg, TRI, true); AI.isValid(); ++AI) + TargetRegs.insert(*AI); + + bool Changed = false; + MachineBasicBlock::iterator LastChange = MBB->begin(); + unsigned SmallestDef = TargetReg; + // Remove redundant Copy instructions unless TargetReg is modified. + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) { + MachineInstr *MI = &*I; + ++I; + if (MI->isCopy() && MI->getOperand(0).isReg() && + MI->getOperand(1).isReg()) { + + unsigned DefReg = MI->getOperand(0).getReg(); + unsigned SrcReg = MI->getOperand(1).getReg(); + + if ((SrcReg == AArch64::XZR || SrcReg == AArch64::WZR) && + !MRI->isReserved(DefReg) && + (TargetReg == DefReg || TRI->isSuperRegister(DefReg, TargetReg))) { + DEBUG(dbgs() << "Remove redundant Copy : "); + DEBUG((MI)->print(dbgs())); + + MI->eraseFromParent(); + Changed = true; + LastChange = I; + NumCopiesRemoved++; + SmallestDef = + TRI->isSubRegister(SmallestDef, DefReg) ? DefReg : SmallestDef; + continue; + } + } + + if (MI->modifiesRegister(TargetReg, TRI)) + break; + } + + if (!Changed) + return false; + + // Otherwise, we have to fixup the use-def chain, starting with the + // CBZ/CBNZ. Conservatively mark as much as we can live. + CompBr->clearRegisterKills(SmallestDef, TRI); + + if (std::none_of(TargetRegs.begin(), TargetRegs.end(), + [&](unsigned Reg) { return MBB->isLiveIn(Reg); })) + MBB->addLiveIn(TargetReg); + + // Clear any kills of TargetReg between CompBr and the last removed COPY. + for (MachineInstr &MMI : + make_range(MBB->begin()->getIterator(), LastChange->getIterator())) + MMI.clearRegisterKills(SmallestDef, TRI); + + return true; +} + +bool AArch64RedundantCopyElimination::runOnMachineFunction( + MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) + return false; + TRI = MF.getSubtarget().getRegisterInfo(); + MRI = &MF.getRegInfo(); + bool Changed = false; + for (MachineBasicBlock &MBB : MF) + Changed |= optimizeCopy(&MBB); + return Changed; +} + +FunctionPass *llvm::createAArch64RedundantCopyEliminationPass() { + return new AArch64RedundantCopyElimination(); +} diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp new file mode 100644 index 0000000000000..0a1831bd9a8ca --- /dev/null +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -0,0 +1,168 @@ +//===- AArch64RegisterBankInfo.cpp -------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the RegisterBankInfo class for +/// AArch64. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#include "AArch64RegisterBankInfo.h" +#include "AArch64InstrInfo.h" // For XXXRegClassID. +#include "llvm/CodeGen/GlobalISel/RegisterBank.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +#ifndef LLVM_BUILD_GLOBAL_ISEL +#error "You shouldn't build this" +#endif + +AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) + : RegisterBankInfo(AArch64::NumRegisterBanks) { + // Initialize the GPR bank. + createRegisterBank(AArch64::GPRRegBankID, "GPR"); + // The GPR register bank is fully defined by all the registers in + // GR64all + its subclasses. + addRegBankCoverage(AArch64::GPRRegBankID, AArch64::GPR64allRegClassID, TRI); + const RegisterBank &RBGPR = getRegBank(AArch64::GPRRegBankID); + (void)RBGPR; + assert(RBGPR.covers(*TRI.getRegClass(AArch64::GPR32RegClassID)) && + "Subclass not added?"); + assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit"); + + // Initialize the FPR bank. + createRegisterBank(AArch64::FPRRegBankID, "FPR"); + // The FPR register bank is fully defined by all the registers in + // GR64all + its subclasses. + addRegBankCoverage(AArch64::FPRRegBankID, AArch64::QQQQRegClassID, TRI); + const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID); + (void)RBFPR; + assert(RBFPR.covers(*TRI.getRegClass(AArch64::QQRegClassID)) && + "Subclass not added?"); + assert(RBFPR.covers(*TRI.getRegClass(AArch64::FPR64RegClassID)) && + "Subclass not added?"); + assert(RBFPR.getSize() == 512 && + "FPRs should hold up to 512-bit via QQQQ sequence"); + + // Initialize the CCR bank. + createRegisterBank(AArch64::CCRRegBankID, "CCR"); + addRegBankCoverage(AArch64::CCRRegBankID, AArch64::CCRRegClassID, TRI); + const RegisterBank &RBCCR = getRegBank(AArch64::CCRRegBankID); + (void)RBCCR; + assert(RBCCR.covers(*TRI.getRegClass(AArch64::CCRRegClassID)) && + "Class not added?"); + assert(RBCCR.getSize() == 32 && "CCR should hold up to 32-bit"); + + assert(verify(TRI) && "Invalid register bank information"); +} + +unsigned AArch64RegisterBankInfo::copyCost(const RegisterBank &A, + const RegisterBank &B, + unsigned Size) const { + // What do we do with different size? + // copy are same size. + // Will introduce other hooks for different size: + // * extract cost. + // * build_sequence cost. + // TODO: Add more accurate cost for FPR to/from GPR. 
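  // A minimal sketch of what that TODO could look like (hypothetical cost
  // values): cross-bank copies are the expensive case, since GPR <-> FPR moves
  // go through FMOV-style transfers, so one might write
  //   if (&A != &B)
  //     return 4;
  // Until real numbers are available, everything defers to the generic
  // implementation below.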
+ return RegisterBankInfo::copyCost(A, B, Size); +} + +const RegisterBank &AArch64RegisterBankInfo::getRegBankFromRegClass( + const TargetRegisterClass &RC) const { + switch (RC.getID()) { + case AArch64::FPR8RegClassID: + case AArch64::FPR16RegClassID: + case AArch64::FPR32RegClassID: + case AArch64::FPR64RegClassID: + case AArch64::FPR128RegClassID: + case AArch64::FPR128_loRegClassID: + case AArch64::DDRegClassID: + case AArch64::DDDRegClassID: + case AArch64::DDDDRegClassID: + case AArch64::QQRegClassID: + case AArch64::QQQRegClassID: + case AArch64::QQQQRegClassID: + return getRegBank(AArch64::FPRRegBankID); + case AArch64::GPR32commonRegClassID: + case AArch64::GPR32RegClassID: + case AArch64::GPR32spRegClassID: + case AArch64::GPR32sponlyRegClassID: + case AArch64::GPR32allRegClassID: + case AArch64::GPR64commonRegClassID: + case AArch64::GPR64RegClassID: + case AArch64::GPR64spRegClassID: + case AArch64::GPR64sponlyRegClassID: + case AArch64::GPR64allRegClassID: + case AArch64::tcGPR64RegClassID: + case AArch64::WSeqPairsClassRegClassID: + case AArch64::XSeqPairsClassRegClassID: + return getRegBank(AArch64::GPRRegBankID); + case AArch64::CCRRegClassID: + return getRegBank(AArch64::CCRRegBankID); + default: + llvm_unreachable("Register class not supported"); + } +} + +RegisterBankInfo::InstructionMappings +AArch64RegisterBankInfo::getInstrAlternativeMappings( + const MachineInstr &MI) const { + switch (MI.getOpcode()) { + case TargetOpcode::G_OR: { + // 32 and 64-bit or can be mapped on either FPR or + // GPR for the same cost. + const MachineFunction &MF = *MI.getParent()->getParent(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI); + if (Size != 32 && Size != 64) + break; + + // If the instruction has any implicit-defs or uses, + // do not mess with it. + if (MI.getNumOperands() != 3) + break; + InstructionMappings AltMappings; + InstructionMapping GPRMapping(/*ID*/ 1, /*Cost*/ 1, /*NumOperands*/ 3); + InstructionMapping FPRMapping(/*ID*/ 2, /*Cost*/ 1, /*NumOperands*/ 3); + for (unsigned Idx = 0; Idx != 3; ++Idx) { + GPRMapping.setOperandMapping(Idx, Size, + getRegBank(AArch64::GPRRegBankID)); + FPRMapping.setOperandMapping(Idx, Size, + getRegBank(AArch64::FPRRegBankID)); + } + AltMappings.emplace_back(std::move(GPRMapping)); + AltMappings.emplace_back(std::move(FPRMapping)); + return AltMappings; + } + default: + break; + } + return RegisterBankInfo::getInstrAlternativeMappings(MI); +} + +void AArch64RegisterBankInfo::applyMappingImpl( + const OperandsMapper &OpdMapper) const { + switch (OpdMapper.getMI().getOpcode()) { + case TargetOpcode::G_OR: { + // Those ID must match getInstrAlternativeMappings. 
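    // (ID 1 is the all-GPR mapping and ID 2 the all-FPR mapping built in
    // getInstrAlternativeMappings above; both keep every G_OR operand on a
    // single bank, which is why the default operand-rewriting is enough here.)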
+ assert((OpdMapper.getInstrMapping().getID() == 1 || + OpdMapper.getInstrMapping().getID() == 2) && + "Don't know how to handle that ID"); + return applyDefaultMapping(OpdMapper); + } + default: + llvm_unreachable("Don't know how to handle that operation"); + } +} diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.h b/lib/Target/AArch64/AArch64RegisterBankInfo.h new file mode 100644 index 0000000000000..907bcfdea161b --- /dev/null +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.h @@ -0,0 +1,69 @@ +//===- AArch64RegisterBankInfo -----------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares the targeting of the RegisterBankInfo class for AArch64. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H + +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" + +namespace llvm { + +class TargetRegisterInfo; + +namespace AArch64 { +enum { + GPRRegBankID = 0, /// General Purpose Registers: W, X. + FPRRegBankID = 1, /// Floating Point/Vector Registers: B, H, S, D, Q. + CCRRegBankID = 2, /// Conditional register: NZCV. + NumRegisterBanks +}; +} // End AArch64 namespace. + +/// This class provides the information for the target register banks. +class AArch64RegisterBankInfo : public RegisterBankInfo { + /// See RegisterBankInfo::applyMapping. + void applyMappingImpl(const OperandsMapper &OpdMapper) const override; + +public: + AArch64RegisterBankInfo(const TargetRegisterInfo &TRI); + /// Get the cost of a copy from \p B to \p A, or put differently, + /// get the cost of A = COPY B. Since register banks may cover + /// different size, \p Size specifies what will be the size in bits + /// that will be copied around. + /// + /// \note Since this is a copy, both registers have the same size. + unsigned copyCost(const RegisterBank &A, const RegisterBank &B, + unsigned Size) const override; + + /// Get a register bank that covers \p RC. + /// + /// \pre \p RC is a user-defined register class (as opposed as one + /// generated by TableGen). + /// + /// \note The mapping RC -> RegBank could be built while adding the + /// coverage for the register banks. However, we do not do it, because, + /// at least for now, we only need this information for register classes + /// that are used in the description of instruction. In other words, + /// there are just a handful of them and we do not want to waste space. + /// + /// \todo This should be TableGen'ed. + const RegisterBank & + getRegBankFromRegClass(const TargetRegisterClass &RC) const override; + + /// Get the alternative mappings for \p MI. + /// Alternative in the sense different from getInstrMapping. + InstructionMappings + getInstrAlternativeMappings(const MachineInstr &MI) const override; +}; +} // End llvm namespace. 
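// A minimal usage sketch (illustrative only; assumes a TargetRegisterInfo
// reference TRI obtained from the subtarget during GlobalISel setup):
//
//   AArch64RegisterBankInfo RBI(TRI);
//   const RegisterBank &GPRBank =
//       RBI.getRegBankFromRegClass(*TRI.getRegClass(AArch64::GPR64RegClassID));
//   unsigned Cost = RBI.copyCost(GPRBank, GPRBank, /*Size=*/64);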
+#endif diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index 32b4888f2f647..af867da4823d1 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -25,7 +25,6 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/IR/Function.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetOptions.h" @@ -51,6 +50,13 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ? CSR_AArch64_CXX_TLS_Darwin_PE_SaveList : CSR_AArch64_CXX_TLS_Darwin_SaveList; + if (MF->getSubtarget<AArch64Subtarget>().getTargetLowering() + ->supportSwiftError() && + MF->getFunction()->getAttributes().hasAttrSomewhere( + Attribute::SwiftError)) + return CSR_AArch64_AAPCS_SwiftError_SaveList; + if (MF->getFunction()->getCallingConv() == CallingConv::PreserveMost) + return CSR_AArch64_RT_MostRegs_SaveList; else return CSR_AArch64_AAPCS_SaveList; } @@ -74,6 +80,12 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, return CSR_AArch64_AllRegs_RegMask; if (CC == CallingConv::CXX_FAST_TLS) return CSR_AArch64_CXX_TLS_Darwin_RegMask; + if (MF.getSubtarget<AArch64Subtarget>().getTargetLowering() + ->supportSwiftError() && + MF.getFunction()->getAttributes().hasAttrSomewhere(Attribute::SwiftError)) + return CSR_AArch64_AAPCS_SwiftError_RegMask; + if (CC == CallingConv::PreserveMost) + return CSR_AArch64_RT_MostRegs_RegMask; else return CSR_AArch64_AAPCS_RegMask; } @@ -190,9 +202,7 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { // If it's wrong, we'll materialize the constant and still get to the // object; it's just suboptimal. Negative offsets use the unscaled // load/store instructions, which have a 9-bit signed immediate. - if (MFI->getLocalFrameSize() < 256) - return false; - return true; + return MFI->getLocalFrameSize() >= 256; } return false; @@ -231,9 +241,7 @@ bool AArch64RegisterInfo::requiresFrameIndexScavenging( bool AArch64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); - // Only consider eliminating leaf frames. 
- if (MFI->hasCalls() || (MF.getTarget().Options.DisableFramePointerElim(MF) && - MFI->adjustsStack())) + if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI->adjustsStack()) return true; return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken(); } @@ -396,8 +404,6 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true); } -namespace llvm { - unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { const AArch64FrameLowering *TFI = getFrameLowering(MF); @@ -437,5 +443,3 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, return 16; } } - -} // namespace llvm diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td index a8c8b176efa9f..5fbaff00a5e71 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/lib/Target/AArch64/AArch64RegisterInfo.td @@ -1,4 +1,4 @@ -//=- AArch64RegisterInfo.td - Describe the AArch64 Regisers --*- tablegen -*-=// +//=- AArch64RegisterInfo.td - Describe the AArch64 Registers -*- tablegen -*-=// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/AArch64/AArch64SchedA53.td b/lib/Target/AArch64/AArch64SchedA53.td index d709bee7b9eb4..93ca079275c8c 100644 --- a/lib/Target/AArch64/AArch64SchedA53.td +++ b/lib/Target/AArch64/AArch64SchedA53.td @@ -19,13 +19,13 @@ def CortexA53Model : SchedMachineModel { let MicroOpBufferSize = 0; // Explicitly set to zero since A53 is in-order. let IssueWidth = 2; // 2 micro-ops are dispatched per cycle. - let MinLatency = 1 ; // OperandCycles are interpreted as MinLatency. let LoadLatency = 3; // Optimistic load latency assuming bypass. // This is overriden by OperandCycles if the // Itineraries are queried instead. let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation // Specification - Instruction Timings" // v 1.0 Spreadsheet + let CompleteModel = 1; } @@ -109,6 +109,8 @@ def A53WriteVST2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5; def A53WriteVST3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6; let ResourceCycles = [3]; } +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } + // Branch def : WriteRes<WriteBr, [A53UnitB]>; def : WriteRes<WriteBrReg, [A53UnitB]>; diff --git a/lib/Target/AArch64/AArch64SchedA57.td b/lib/Target/AArch64/AArch64SchedA57.td index ca4457af8525a..a266351f7ffc0 100644 --- a/lib/Target/AArch64/AArch64SchedA57.td +++ b/lib/Target/AArch64/AArch64SchedA57.td @@ -30,6 +30,7 @@ def CortexA57Model : SchedMachineModel { // Enable partial & runtime unrolling. The magic number is chosen based on // experiments and benchmarking data. 
let LoopMicroOpBufferSize = 16; + let CompleteModel = 1; } //===----------------------------------------------------------------------===// @@ -96,6 +97,8 @@ def : SchedAlias<WriteV, A57Write_3cyc_1V>; def : SchedAlias<WriteVLD, A57Write_5cyc_1L>; def : SchedAlias<WriteVST, A57Write_1cyc_1S>; +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } + def : WriteRes<WriteSys, []> { let Latency = 1; } def : WriteRes<WriteBarrier, []> { let Latency = 1; } def : WriteRes<WriteHint, []> { let Latency = 1; } diff --git a/lib/Target/AArch64/AArch64SchedCyclone.td b/lib/Target/AArch64/AArch64SchedCyclone.td index a2a1802377894..9fd3ae6818e5d 100644 --- a/lib/Target/AArch64/AArch64SchedCyclone.td +++ b/lib/Target/AArch64/AArch64SchedCyclone.td @@ -1,4 +1,4 @@ -//=- ARMSchedCyclone.td - AArch64 Cyclone Scheduling Defs ----*- tablegen -*-=// +//=- AArch64SchedCyclone.td - Cyclone Scheduling Definitions -*- tablegen -*-=// // // The LLVM Compiler Infrastructure // @@ -17,6 +17,7 @@ def CycloneModel : SchedMachineModel { let MicroOpBufferSize = 192; // Based on the reorder buffer. let LoadLatency = 4; // Optimistic load latency. let MispredictPenalty = 16; // 14-19 cycles are typical. + let CompleteModel = 1; } //===----------------------------------------------------------------------===// @@ -107,7 +108,7 @@ def WriteX : SchedWriteRes<[]> { let Latency = 0; } // The move is replaced by a single nop micro-op. // MOVZ Rd, #0 // AND Rd, Rzr, #imm -def WriteZPred : SchedPredicate<[{TII->isGPRZero(MI)}]>; +def WriteZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>; def WriteImmZ : SchedWriteVariant<[ SchedVar<WriteZPred, [WriteX]>, SchedVar<NoSchedPred, [WriteImm]>]>; @@ -116,8 +117,8 @@ def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>; // Move GPR is a register rename and single nop micro-op. // ORR Xd, XZR, Xm // ADD Xd, Xn, #0 -def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(MI)}]>; -def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(MI)}]>; +def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(*MI)}]>; +def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(*MI)}]>; def WriteMov : SchedWriteVariant<[ SchedVar<WriteIMovPred, [WriteX]>, SchedVar<WriteVMovPred, [WriteX]>, @@ -726,7 +727,7 @@ def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV], def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV], (instrs LD3Rv1d,LD3Rv2d)>; def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV], - (instrs LD3Rv2d_POST,LD3Rv2d_POST)>; + (instrs LD3Rv1d_POST,LD3Rv2d_POST)>; def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV], (instregex "LD4Fourv(8b|4h|2s)$")>; @@ -851,6 +852,9 @@ def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex "ST4i(8|16|32)_POST")>; def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST4i64)>; def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>; +// Atomic operations are not supported. +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } + //--- // Unused SchedRead types //--- diff --git a/lib/Target/AArch64/AArch64SchedKryo.td b/lib/Target/AArch64/AArch64SchedKryo.td new file mode 100644 index 0000000000000..4e491a04c78df --- /dev/null +++ b/lib/Target/AArch64/AArch64SchedKryo.td @@ -0,0 +1,133 @@ +//==- AArch64SchedKryo.td - Qualcomm Kryo Scheduling Defs ---*- tablegen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Qualcomm Kryo to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// The issue width is set to five, matching the five issue queues for expanded +// uops. Now, the latency spreadsheet has information based on fragmented uops, +// but these do not actually take up an issue queue. + +def KryoModel : SchedMachineModel { + let IssueWidth = 5; // 5-wide issue for expanded uops + let MicroOpBufferSize = 128; // Out-of-order with temporary unified issue buffer + let LoadLatency = 4; // Optimistic load latency + let MispredictPenalty = 14; // Fetch + Decode/Rename/Dispatch + Branch + + // Enable partial & runtime unrolling. The magic number is chosen based on + // experiments and benchmarking data. + let LoopMicroOpBufferSize = 16; + let CompleteModel = 1; +} + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Kryo. + +let SchedModel = KryoModel in { + def KryoUnitXA : ProcResource<1>; // Type X(A) micro-ops + def KryoUnitXB : ProcResource<1>; // Type X(B) micro-ops + def KryoUnitYA : ProcResource<1>; // Type Y(A) micro-ops + def KryoUnitYB : ProcResource<1>; // Type Y(B) micro-ops + def KryoUnitX : ProcResGroup<[KryoUnitXA, // Type X micro-ops + KryoUnitXB]>; + def KryoUnitY : ProcResGroup<[KryoUnitYA, // Type Y micro-ops + KryoUnitYB]>; + def KryoUnitXY : ProcResGroup<[KryoUnitXA, // Type XY micro-ops + KryoUnitXB, + KryoUnitYA, + KryoUnitYB]>; + def KryoUnitLSA : ProcResource<1>; // Type LS(A) micro-ops + def KryoUnitLSB : ProcResource<1>; // Type LS(B) micro-ops + def KryoUnitLS : ProcResGroup<[KryoUnitLSA, // Type LS micro-ops + KryoUnitLSB]>; +} + +let SchedModel = KryoModel in { + +//===----------------------------------------------------------------------===// +// Map the target-defined scheduler read/write resources and latency for +// Kryo. 
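// Each WriteRes below binds one of the generic AArch64 SchedWrite types
// (WriteImm, WriteI, WriteLD, ...) to the Kryo issue-queue resources defined
// above, together with a latency and micro-op count. Per-instruction overrides
// then come from AArch64SchedKryoDetails.td, included at the end of this file.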
+ +def : WriteRes<WriteImm, [KryoUnitXY]> { let Latency = 1; } +def : WriteRes<WriteI, [KryoUnitXY]> { let Latency = 1; } +def : WriteRes<WriteISReg, [KryoUnitXY, KryoUnitXY]> + { let Latency = 2; let NumMicroOps = 2; } +def : WriteRes<WriteIEReg, [KryoUnitXY, KryoUnitXY]> + { let Latency = 2; let NumMicroOps = 2; } +def : WriteRes<WriteExtr, [KryoUnitXY, KryoUnitX]> + { let Latency = 2; let NumMicroOps = 2; } +def : WriteRes<WriteIS, [KryoUnitXY]> { let Latency = 2; } +def : WriteRes<WriteID32, [KryoUnitXA, KryoUnitY]> + { let Latency = 8; let NumMicroOps = 1; } // Fragent -1 +def : WriteRes<WriteID64, [KryoUnitXA, KryoUnitY]> + { let Latency = 8; let NumMicroOps = 1; } // Fragent -1 +def : WriteRes<WriteIM32, [KryoUnitX]> { let Latency = 5; } +def : WriteRes<WriteIM64, [KryoUnitX]> { let Latency = 5; } +def : WriteRes<WriteBr, [KryoUnitXY]> { let Latency = 1; } +def : WriteRes<WriteBrReg, [KryoUnitXY]> { let Latency = 1; } +def : WriteRes<WriteLD, [KryoUnitLS]> { let Latency = 4; } +def : WriteRes<WriteST, [KryoUnitLS]> { let Latency = 4; } +def : WriteRes<WriteSTP, [KryoUnitLS]> { let Latency = 4; } +def : WriteRes<WriteAdr, [KryoUnitXY]> { let Latency = 6; } +def : WriteRes<WriteLDIdx, [KryoUnitLS]> { let Latency = 4; } +def : WriteRes<WriteSTIdx, [KryoUnitLS]> { let Latency = 4; } +def : WriteRes<WriteF, [KryoUnitXY, KryoUnitXY]> + { let Latency = 3; let NumMicroOps = 2; } +def : WriteRes<WriteFCmp, [KryoUnitXY]> { let Latency = 2; } +def : WriteRes<WriteFCvt, [KryoUnitX]> { let Latency = 4; } +def : WriteRes<WriteFCopy, [KryoUnitXY]> { let Latency = 6; } +def : WriteRes<WriteFImm, [KryoUnitXY]> { let Latency = 6; } +def : WriteRes<WriteFMul, [KryoUnitX, KryoUnitX]> + { let Latency = 6; let NumMicroOps = 2; } +def : WriteRes<WriteFDiv, [KryoUnitXA, KryoUnitY]> + { let Latency = 12; let NumMicroOps = 2; } // Fragent -1 / NoRSV +1 +def : WriteRes<WriteV, [KryoUnitXY]> { let Latency = 6; } +def : WriteRes<WriteVLD, [KryoUnitLS]> { let Latency = 4; } +def : WriteRes<WriteVST, [KryoUnitLS]> { let Latency = 4; } + +def : WriteRes<WriteSys, []> { let Latency = 1; } +def : WriteRes<WriteBarrier, []> { let Latency = 1; } +def : WriteRes<WriteHint, []> { let Latency = 1; } + +def : WriteRes<WriteLDHi, []> { let Latency = 4; } + +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } + +// No forwarding logic is modelled yet. +def : ReadAdvance<ReadI, 0>; +def : ReadAdvance<ReadISReg, 0>; +def : ReadAdvance<ReadIEReg, 0>; +def : ReadAdvance<ReadIM, 0>; +def : ReadAdvance<ReadIMA, 0>; +def : ReadAdvance<ReadID, 0>; +def : ReadAdvance<ReadExtrHi, 0>; +def : ReadAdvance<ReadAdrBase, 0>; +def : ReadAdvance<ReadVLD, 0>; + + +//===----------------------------------------------------------------------===// +// Specialize the coarse model by associating instruction groups with the +// subtarget-defined types. As the modeled is refined, this will override most +// of the above SchedWriteRes and SchedAlias mappings. 
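// (In AArch64SchedKryoDetails.td, included below, the records follow a
// generated naming scheme, KryoWrite_<latency>cyc_<resources>_<N>ln, where the
// trailing number appears to refer to the row of the latency spreadsheet the
// entry was derived from.)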
+ +// Miscellaneous +// ----------------------------------------------------------------------------- + +def : InstRW<[WriteI], (instrs COPY)>; + + +// Detailed Refinedments +// ----------------------------------------------------------------------------- +include "AArch64SchedKryoDetails.td" + + +} // SchedModel = KryoModel diff --git a/lib/Target/AArch64/AArch64SchedKryoDetails.td b/lib/Target/AArch64/AArch64SchedKryoDetails.td new file mode 100644 index 0000000000000..426ae6103e4b5 --- /dev/null +++ b/lib/Target/AArch64/AArch64SchedKryoDetails.td @@ -0,0 +1,2358 @@ +//=- AArch64SchedKryoDetails.td - QC Kryo Scheduling Defs ----*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the uop and latency details for the machine model for the +// Qualcomm Kryo subtarget. +// +//===----------------------------------------------------------------------===// + +def KryoWrite_3cyc_X_noRSV_138ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_X_noRSV_138ln], + (instregex "(S|U)R?SRA(d|(v2i32|v4i16|v8i8)_shift)")>; + +def KryoWrite_3cyc_X_X_139ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_X_X_139ln], + (instregex "(S|U)R?SRA(v2i64|v4i32|v8i16|v16i8)_shift")>; + +def KryoWrite_4cyc_XY_XY_noRSV_172ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 4; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_4cyc_XY_XY_noRSV_172ln], + (instregex "(S|U)ABA(v8i8|v4i16|v2i32)")>; +def KryoWrite_4cyc_XY_XY_XY_XY_178ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY, KryoUnitXY]> { + let Latency = 4; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_4cyc_XY_XY_XY_XY_178ln], + (instregex "(S|U)ABA(v16i8|v8i16|v4i32)")>; +def KryoWrite_3cyc_XY_XY_XY_XY_177ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_XY_XY_177ln], + (instregex "(S|U)ABALv.*")>; +def KryoWrite_3cyc_XY_XY_166ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_166ln], + (instregex "(S|U)(ABD|QSUB|RHADD)(v16i8|v8i16|v4i32|v2i64)")>; +def KryoWrite_3cyc_XY_noRSV_159ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_159ln], + (instregex "(S|U)(ABD|RHADD)(v8i8|v4i16|v2i32)")>; +def KryoWrite_3cyc_XY_XY_165ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_165ln], + (instregex "(S|U)ABDLv.*")>; +def KryoWrite_3cyc_X_noRSV_154ln : + SchedWriteRes<[KryoUnitX]> { +let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_X_noRSV_154ln], + (instregex "(S|U)ADALP(v8i8|v4i16|v2i32)_v.*")>; +def KryoWrite_3cyc_X_X_155ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_X_X_155ln], + (instregex "(S|U)ADALP(v16i8|v8i16|v4i32)_v.*")>; +def KryoWrite_2cyc_XY_XY_151ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_151ln], + (instregex "(S|U)(ADD|SUB)Lv.*")>; +def KryoWrite_2cyc_XY_noRSV_148ln : + 
SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_148ln], + (instregex "((S|U)ADDLP|ABS)(v2i32|v4i16|v8i8)(_v.*)?")>; +def KryoWrite_2cyc_XY_XY_150ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_150ln], + (instregex "((S|U)ADDLP|ABS)(v2i64|v4i32|v8i16|v16i8)(_v.*)?")>; +def KryoWrite_3cyc_XY_XY_XY_noRSV_179ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_XY_noRSV_179ln], + (instrs SADDLVv4i32v, UADDLVv4i32v)>; +def KryoWrite_5cyc_XY_XY_XY_noRSV_180ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY]> { + let Latency = 5; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_5cyc_XY_XY_XY_noRSV_180ln], + (instrs SADDLVv8i16v, UADDLVv8i16v)>; +def KryoWrite_6cyc_XY_XY_X_noRSV_181ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_6cyc_XY_XY_X_noRSV_181ln], + (instrs SADDLVv16i8v, UADDLVv16i8v)>; +def KryoWrite_3cyc_XY_noRSV_158ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_158ln], + (instrs SADDLVv4i16v, UADDLVv4i16v, ADDVv4i16v)>; +def KryoWrite_4cyc_X_noRSV_169ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_169ln], + (instrs SADDLVv8i8v, UADDLVv8i8v, ADDVv8i8v)>; +def KryoWrite_2cyc_XY_XY_XY_XY_176ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_XY_XY_176ln], + (instregex "(S|U)(ADDW|SUBW)v.*")>; +def KryoWrite_4cyc_X_noRSV_40ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_40ln], + (instregex "(S|U)CVTFS(W|X)(D|S)ri")>; +def KryoWrite_4cyc_X_noRSV_97ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_97ln], + (instregex "(S|U)CVTFU(W|X)(D|S)ri")>; +def KryoWrite_4cyc_X_noRSV_110ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_110ln], + (instregex "(S|U)CVTF(v1i32|v2i32|v1i64|v2f32|d|s)(_shift)?")>; +def KryoWrite_4cyc_X_X_114ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_X_114ln], + (instregex "(S|U)CVTF(v2i64|v4i32|v2f64|v4f32)(_shift)?")>; +def KryoWrite_1cyc_XA_Y_98ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XA_Y_98ln], + (instregex "(S|U)DIV(_Int)?(W|X)r")>; +def KryoWrite_2cyc_XY_XY_152ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_152ln], + (instregex "(S|U)H(ADD|SUB)(v16i8|v8i16|v4i32)")>; +def KryoWrite_2cyc_XY_noRSV_149ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_149ln], + (instregex "((S|U)H(ADD|SUB)|ADDP)(v8i8|v4i16|v2i32)")>; +def KryoWrite_4cyc_X_70ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_X_70ln], + (instregex "(S|U)(MADDL|MSUBL)rrr")>; +def KryoWrite_4cyc_X_X_191ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : 
InstRW<[KryoWrite_4cyc_X_X_191ln], + (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>; +def KryoWrite_1cyc_XY_195ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_195ln], + (instregex "(S|U)MOVv.*")>; +def KryoWrite_5cyc_X_71ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_5cyc_X_71ln], + (instrs SMULHrr, UMULHrr)>; +def KryoWrite_3cyc_XY_noRSV_186ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_186ln], + (instregex "^(S|U)QADD(v8i8|v4i16|v2i32)")>; +def KryoWrite_3cyc_XY_XY_187ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_187ln], + (instregex "^(S|U)QADD(v16i8|v8i16|v4i32|v2i64)")>; +def KryoWrite_3cyc_XY_noRSV_69ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_69ln], + (instregex "(S|U|SU|US)QADD(v1i8|v1i16|v2i16|v1i32|v1i64)")>; +def KryoWrite_3cyc_XY_noRSV_248ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_248ln], + (instregex "(S|U)QSHLU?(d|s|h|b|(v8i8|v4i16|v2i32)_shift)$")>; +def KryoWrite_3cyc_XY_XY_250ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_250ln], + (instregex "(S|U)(QSHLU?|RSHR)(v16i8|v8i16|v4i32|v2i64)_shift$")>; +def KryoWrite_3cyc_XY_noRSV_246ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_246ln], + (instregex "(S|U)(QSHL|RSHL|QRSHL)(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32)$")>; +def KryoWrite_3cyc_XY_XY_251ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_251ln], + (instregex "(S|U)(QSHL|RSHL|QRSHL)(v16i8|v8i16|v4i32|v2i64)$")>; +def KryoWrite_6cyc_XY_X_238ln : + SchedWriteRes<[KryoUnitXY, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_XY_X_238ln], + (instregex "((S|U)QR?SHRN|SQR?SHRUN)(v16i8|v8i16|v4i32)_shift$")>; +def KryoWrite_3cyc_XY_noRSV_249ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_249ln], + (instregex "((S|U)QR?SHRN|SQR?SHRUN)(s|h|b)?")>; +def KryoWrite_6cyc_XY_X_noRSV_252ln : + SchedWriteRes<[KryoUnitXY, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_6cyc_XY_X_noRSV_252ln], + (instregex "((S|U)QR?SHRN|SQR?SHRUN)(v8i8|v4i16|v2i32)_shift?")>; +def KryoWrite_3cyc_XY_noRSV_161ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_161ln], + (instregex "(S|U)QSUB(v8i8|v4i16|v2i32|v1i64|v1i32|v1i16|v1i8)")>; +def KryoWrite_3cyc_XY_noRSV_163ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_163ln], + (instregex "(S|U)QXTU?N(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)")>; +def KryoWrite_3cyc_XY_noRSV_162ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_162ln], + (instregex "(S|U)QXTU?N(v1i8|v1i16|v1i32)")>; +def KryoWrite_3cyc_XY_noRSV_247ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_247ln], + (instregex 
"(S|U)RSHR(d|(v8i8|v4i16|v2i32)_shift)$")>; +def KryoWrite_2cyc_XY_noRSV_239ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_239ln], + (instregex "(S|U)SHL(d|v8i8|v4i16|v2i32|v1i64)$")>; +def KryoWrite_2cyc_XY_XY_243ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_243ln], + (instregex "(S|U)SHL(v16i8|v8i16|v4i32|v2i64)$")>; +def KryoWrite_2cyc_XY_XY_241ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_241ln], + (instregex "(S|U)?SHLL(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)(_shift)?$")>; +def KryoWrite_2cyc_XY_noRSV_240ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_240ln], + (instregex "((S|U)SHR|SHL)(d|(v8i8|v4i16|v2i32)_shift)$")>; +def KryoWrite_2cyc_XY_XY_242ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_242ln], + (instregex "((S|U)SHR|SHL)(v16i8|v8i16|v4i32|v2i64)_shift$")>; +def KryoWrite_2cyc_XY_XY_183ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_183ln], + (instregex "(S|U)(MAX|MIN)P?(v16i8|v8i16|v4i32)")>; +def KryoWrite_2cyc_XY_noRSV_182ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_182ln], + (instregex "(S|U)(MAX|MIN)P?(v8i8|v4i16|v2i32)")>; +def KryoWrite_3cyc_XY_noRSV_184ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_184ln], + (instregex "(S|U)(MAX|MIN)V(v4i16v|v8i8v|v4i32)")>; +def KryoWrite_4cyc_X_noRSV_185ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_185ln], + (instregex "(S|U)(MAX|MIN)V(v16i8v|v8i16v)")>; +def KryoWrite_2cyc_XY_noRSV_67ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_67ln], + (instrs ABSv1i64)>; +def KryoWrite_1cyc_XY_63ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_63ln, ReadI, ReadI], + (instregex "ADC.*")>; +def KryoWrite_1cyc_XY_63_1ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_63_1ln], + (instregex "ADR.*")>; +def KryoWrite_1cyc_XY_62ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_62ln, ReadI], + (instregex "ADDS?(W|X)ri")>; +def KryoWrite_2cyc_XY_XY_64ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_64ln, ReadI, ReadI], + (instregex "ADDS?(W|X)r(r|s|x)(64)?")>; +def KryoWrite_1cyc_XY_noRSV_65ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_65ln], + (instrs ADDv1i64)>; +def KryoWrite_1cyc_XY_noRSV_144ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_144ln], + (instregex "(ADD|SUB)(v8i8|v4i16|v2i32|v1i64)")>; +def KryoWrite_1cyc_XY_XY_146ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_146ln], + (instregex 
"(ADD|SUB)(v16i8|v8i16|v4i32|v2i64)")>; +def KryoWrite_4cyc_XY_X_noRSV_171ln : + SchedWriteRes<[KryoUnitXY, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_4cyc_XY_X_noRSV_171ln], + (instregex "(ADD|SUB)HNv.*")>; +def KryoWrite_1cyc_XY_noRSV_66ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_66ln], + (instrs ADDPv2i64p)>; +def KryoWrite_2cyc_XY_XY_153ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_153ln], + (instregex "ADDP(v16i8|v8i16|v4i32|v2i64)")>; +def KryoWrite_3cyc_XY_XY_noRSV_170ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_noRSV_170ln], + (instrs ADDVv4i32v)>; +def KryoWrite_4cyc_XY_XY_noRSV_173ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 4; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_4cyc_XY_XY_noRSV_173ln], + (instrs ADDVv8i16v)>; +def KryoWrite_5cyc_XY_X_noRSV_174ln : + SchedWriteRes<[KryoUnitXY, KryoUnitX]> { + let Latency = 5; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_5cyc_XY_X_noRSV_174ln], + (instrs ADDVv16i8v)>; +def KryoWrite_3cyc_XY_XY_X_X_27ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_X_X_27ln], + (instrs AESDrr, AESErr)>; +def KryoWrite_2cyc_X_X_22ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_X_X_22ln], + (instrs AESIMCrr, AESMCrr)>; +def KryoWrite_1cyc_XY_noRSV_76ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_76ln], + (instregex "((AND|ORN|EOR|EON)S?(Wr[rsi]|v8i8|v4i16|v2i32)|(ORR|BIC)S?(Wr[rs]|v8i8|v4i16|v2i32))")>; +def KryoWrite_1cyc_XY_XY_79ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_79ln], + (instregex "((AND|ORN|EOR|EON)S?(Xr[rsi]|v16i8|v8i16|v4i32)|(ORR|BIC)S?(Xr[rs]|v16i8|v8i16|v4i32))")>; +def KryoWrite_1cyc_X_72ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_X_72ln], + (instregex "(S|U)?BFM.*")>; +def KryoWrite_1cyc_XY_noRSV_77ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_77ln], + (instregex "(BIC|ORR)S?Wri")>; +def KryoWrite_1cyc_XY_XY_78ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_78ln], + (instregex "(BIC|ORR)S?Xri")>; +def KryoWrite_1cyc_X_noRSV_74ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_noRSV_74ln], + (instrs BIFv8i8, BITv8i8, BSLv8i8)>; +def KryoWrite_1cyc_X_X_75ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_X_75ln], + (instrs BIFv16i8, BITv16i8, BSLv16i8)>; +def KryoWrite_0cyc_noRSV_11ln : + SchedWriteRes<[]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_noRSV_11ln], + (instrs BRK, DCPS1, DCPS2, DCPS3, HLT, HVC, ISB, HINT, SMC, SVC)>; +def KryoWrite_0cyc_XY_16ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_XY_16ln, ReadI], + (instregex "(CCMN|CCMP)(W|X)i")>; +def 
KryoWrite_0cyc_XY_16_1ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_XY_16_1ln, ReadI, ReadI], + (instregex "(CCMN|CCMP)(W|X)r")>; +def KryoWrite_2cyc_XY_3ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_2cyc_XY_3ln, ReadI], + (instregex "(CLS|CLZ)(W|X)r")>; +def KryoWrite_2cyc_XY_noRSV_7ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_7ln], + (instregex "(CLS|CLZ|CNT)(v4i32|v8i16|v16i8)")>; +def KryoWrite_2cyc_XY_XY_8ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_8ln], + (instregex "(CLS|CLZ|CNT)(v2i32|v4i16|v8i8)")>; +def KryoWrite_2cyc_XY_noRSV_80ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_80ln], + (instregex "CM(EQ|GE|HS|GT|HI|TST)(v8i8|v4i16|v2i32|v1i64)$")>; +def KryoWrite_2cyc_XY_XY_83ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_83ln], + (instregex "CM(EQ|GE|HS|GT|HI|TST)(v16i8|v8i16|v4i32|v2i64)$")>; +def KryoWrite_2cyc_XY_noRSV_81ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_81ln], + (instregex "CM(EQ|LE|GE|GT|LT)(v8i8|v4i16|v2i32|v1i64)rz$")>; +def KryoWrite_2cyc_XY_XY_82ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_82ln], + (instregex "CM(EQ|LE|GE|GT|LT)(v16i8|v8i16|v4i32|v2i64)rz$")>; +def KryoWrite_3cyc_XY_4ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_XY_4ln, ReadI, ReadISReg], + (instregex "CRC32.*")>; +def KryoWrite_1cyc_XY_20ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_20ln, ReadI, ReadI], + (instregex "CSEL(W|X)r")>; +def KryoWrite_1cyc_X_17ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_X_17ln, ReadI, ReadI], + (instregex "(CSINC|CSNEG)(W|X)r")>; +def KryoWrite_1cyc_XY_18ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_18ln, ReadI, ReadI], + (instregex "(CSINV)(W|X)r")>; +def KryoWrite_3cyc_LS_X_13ln : + SchedWriteRes<[KryoUnitLS, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_X_13ln], + (instrs DRPS)>; +def KryoWrite_0cyc_LS_10ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_LS_10ln], + (instrs DSB, DMB, CLREX)>; +def KryoWrite_1cyc_X_noRSV_196ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_noRSV_196ln], + (instregex "DUP(v8i8|v4i16|v2i32)(gpr|lane)")>; +def KryoWrite_1cyc_X_X_197ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_X_197ln], + (instregex "DUP(v16i8|v8i16|v4i32|v2i64)(gpr|lane)")>; +def KryoWrite_3cyc_LS_LS_X_15ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_X_15ln], + (instrs ERET)>; +def KryoWrite_1cyc_X_noRSV_207ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : 
InstRW<[KryoWrite_1cyc_X_noRSV_207ln], + (instrs EXTv8i8)>; +def KryoWrite_1cyc_X_X_212ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_X_212ln], + (instrs EXTv16i8)>; +def KryoWrite_2cyc_XY_X_136ln : + SchedWriteRes<[KryoUnitXY, KryoUnitX]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_X_136ln], + (instrs EXTRWrri, EXTRXrri)>; +def KryoWrite_2cyc_XY_noRSV_35ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_35ln], + (instregex "F(MAX|MIN)(NM)?P?(D|S)rr")>; +def KryoWrite_2cyc_XY_XY_106ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_106ln], + (instregex "(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2i64p|v2f64|v4f32)")>; +def KryoWrite_2cyc_XY_noRSV_104ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_104ln], + (instregex "(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2f32|v2i32p)")>; +def KryoWrite_3cyc_XY_noRSV_107ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_107ln], + (instregex "F(MAX|MIN)(NM)?Vv4i32v")>; +def KryoWrite_3cyc_XY_noRSV_101ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_101ln], + (instregex "FABD(32|64|v2f32)")>; +def KryoWrite_3cyc_XY_XY_103ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_103ln], + (instregex "(FABD|FADD|FSUB|FADDP)(v4f32|v2f64)")>; +def KryoWrite_1cyc_XY_noRSV_48ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_48ln], + (instregex "F(ABS|NEG)(D|S)r")>; +def KryoWrite_1cyc_XY_noRSV_124ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_124ln], + (instregex "F(ABS|NEG)v2f32")>; +def KryoWrite_1cyc_XY_XY_125ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_125ln], + (instregex "F(ABS|NEG)(v2f64|v4f32)")>; +def KryoWrite_2cyc_XY_noRSV_33ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_33ln], + (instregex "(FAC(GE|GT)|FCM(EQ|GE|GT))(32|64)")>; +def KryoWrite_3cyc_XY_noRSV_30ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_30ln], + (instregex "(FADD|FSUB)(D|S)rr")>; +def KryoWrite_3cyc_XY_noRSV_100ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_100ln], + (instregex "(FADD|FSUB|FADDP)v2f32")>; +def KryoWrite_3cyc_XY_noRSV_29ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_29ln], + (instregex "FADDP(v2i32p|v2i64p)")>; +def KryoWrite_0cyc_XY_31ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_XY_31ln], + (instregex "FCCMPE?(D|S)rr")>; +def KryoWrite_2cyc_XY_noRSV_34ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_34ln], + (instregex "FCM(EQ|LE|GE|GT|LT)(v1i32|v1i64)rz")>; +def KryoWrite_2cyc_XY_XY_36ln : + 
SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_36ln], + (instregex "FCM(EQ|LE|GE|GT|LT)(v2i64|v4i32)rz")>; +def KryoWrite_2cyc_XY_noRSV_105ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_105ln], + (instregex "FCM(EQ|LE|GE|GT|LT)v2i32rz")>; +def KryoWrite_0cyc_XY_32ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_XY_32ln], + (instregex "FCMPE?(D|S)r(r|i)")>; +def KryoWrite_1cyc_XY_noRSV_49ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_49ln], + (instrs FCSELDrrr, FCSELSrrr)>; +def KryoWrite_4cyc_X_noRSV_41ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_41ln], + (instrs FCVTDHr, FCVTDSr, FCVTHDr, FCVTHSr, FCVTSDr, FCVTSHr)>; +def KryoWrite_4cyc_X_38ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_X_38ln], + (instregex "FCVT(((A|N|M|P)(S|U)(S|U)|Z(S|U)_Int(S|U))(W|X)(D|S)ri?|Z(S|U)(d|s))$")>; +def KryoWrite_4cyc_X_noRSV_113ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_113ln], + (instregex "FCVT((A|N|M|P)(S|U)|Z(S|U)_Int)(v1i32|v1i64|v2f32)$")>; +def KryoWrite_4cyc_X_X_117ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_X_117ln], + (instregex "FCVT((A|N|M|P)(S|U)|Z(S|U)_Int)(v4f32|v2f64)$")>; +def KryoWrite_5cyc_X_X_XY_noRSV_119ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitXY]> { + let Latency = 5; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_5cyc_X_X_XY_noRSV_119ln], + (instregex "FCVTX?N(v2f32|v4f32|v2i32|v4i16|v4i32|v8i16)$")>; +def KryoWrite_4cyc_X_X_116ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_X_116ln], + (instregex "FCVTL(v2i32|v4i16|v4i32|v8i16)$")>; +def KryoWrite_4cyc_X_noRSV_112ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_112ln], + (instrs FCVTXNv1i64)>; +def KryoWrite_4cyc_X_37ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_X_37ln], + (instregex "FCVTZ(S|U)(S|U)(W|X)(D|S)ri?$")>; +def KryoWrite_4cyc_X_noRSV_111ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_111ln], + (instregex "FCVTZ(S|U)(v2f32|v1i32|v1i64|v2i32(_shift)?)$")>; +def KryoWrite_4cyc_X_X_115ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_X_115ln], + (instregex "FCVTZ(S|U)(v2f64|v4f32|(v2i64|v4i32)(_shift)?)$")>; +def KryoWrite_1cyc_XA_Y_noRSV_43ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_43ln], + (instrs FDIVDrr, FDIVSrr)>; +def KryoWrite_1cyc_XA_Y_noRSV_121ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_121ln], + (instrs FDIVv2f32)>; +def KryoWrite_1cyc_XA_Y_XA_Y_123ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_123ln], + (instrs FDIVv2f64, 
FDIVv4f32)>; +def KryoWrite_5cyc_X_noRSV_55ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_55ln], + (instregex "FN?M(ADD|SUB)Srrr")>; +def KryoWrite_6cyc_X_noRSV_57ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_noRSV_57ln], + (instregex "FN?M(ADD|SUB)Drrr")>; +def KryoWrite_5cyc_X_noRSV_51ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_51ln], + (instrs FMLAv2f32, FMLSv2f32, FMLAv1i32_indexed, FMLSv1i32_indexed)>; +def KryoWrite_5cyc_X_X_56ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_X_56ln], + (instrs FMLAv4f32, FMLSv4f32)>; +def KryoWrite_6cyc_X_X_61ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_X_61ln], + (instrs FMLAv2f64, FMLSv2f64)>; +def KryoWrite_5cyc_X_noRSV_128ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_128ln], + (instrs FMLAv2i32_indexed, FMLSv2i32_indexed)>; +def KryoWrite_5cyc_X_X_131ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_X_131ln], + (instrs FMLAv4i32_indexed, FMLSv4i32_indexed)>; +def KryoWrite_6cyc_X_X_134ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_X_134ln], + (instrs FMLAv2i64_indexed, FMLSv2i64_indexed)>; +def KryoWrite_6cyc_X_noRSV_60ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_noRSV_60ln], + (instrs FMLAv1i64_indexed, FMLSv1i64_indexed, FMULv1i64_indexed, FMULXv1i64_indexed)>; +def KryoWrite_1cyc_XY_45ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_45ln], + (instregex "FMOV(XDHigh|DXHigh|DX)r")>; +def KryoWrite_1cyc_XY_noRSV_47ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_47ln], + (instregex "FMOV(Di|Dr|Si|Sr|SWr|WSr|XDr|v.*_ns)")>; +def KryoWrite_5cyc_X_noRSV_53ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_53ln], + (instrs FMULv1i32_indexed, FMULXv1i32_indexed)>; +def KryoWrite_5cyc_X_noRSV_127ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_127ln], + (instrs FMULv2f32, FMULXv2f32, FMULv2i32_indexed, FMULXv2i32_indexed)>; +def KryoWrite_5cyc_X_X_130ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_X_130ln], + (instrs FMULv4f32, FMULXv4f32, FMULv4i32_indexed, FMULXv4i32_indexed)>; +def KryoWrite_6cyc_X_X_133ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_X_133ln], + (instrs FMULv2f64, FMULXv2f64, FMULv2i64_indexed, FMULXv2i64_indexed)>; +def KryoWrite_5cyc_X_noRSV_54ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_54ln], + (instrs FMULSrr, FNMULSrr, FMULX32)>; +def KryoWrite_6cyc_X_noRSV_59ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_noRSV_59ln], + (instrs FMULDrr, FNMULDrr, 
FMULX64)>; +def KryoWrite_3cyc_XY_noRSV_28ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_28ln], + (instrs FRECPEv1i32, FRECPEv1i64, FRSQRTEv1i32, FRSQRTEv1i64 )>; +def KryoWrite_3cyc_XY_noRSV_99ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_99ln], + (instrs FRECPEv2f32, FRSQRTEv2f32)>; +def KryoWrite_3cyc_XY_XY_102ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_102ln], + (instrs FRECPEv2f64, FRECPEv4f32, FRSQRTEv2f64, FRSQRTEv4f32)>; +def KryoWrite_5cyc_X_noRSV_52ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_52ln], + (instrs FRECPS32, FRSQRTS32)>; +def KryoWrite_6cyc_X_noRSV_58ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_noRSV_58ln], + (instrs FRECPS64, FRSQRTS64)>; +def KryoWrite_5cyc_X_noRSV_126ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_126ln], + (instrs FRECPSv2f32, FRSQRTSv2f32)>; +def KryoWrite_5cyc_X_X_129ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_X_129ln], + (instrs FRECPSv4f32, FRSQRTSv4f32)>; +def KryoWrite_6cyc_X_X_132ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_X_132ln], + (instrs FRECPSv2f64, FRSQRTSv2f64)>; +def KryoWrite_3cyc_XY_noRSV_50ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_50ln], + (instrs FRECPXv1i32, FRECPXv1i64)>; +def KryoWrite_2cyc_XY_noRSV_39ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_39ln], + (instregex "FRINT(A|I|M|N|P|X|Z)(S|D)r")>; +def KryoWrite_2cyc_XY_noRSV_108ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_108ln], + (instregex "FRINT(A|I|M|N|P|X|Z)v2f32")>; +def KryoWrite_2cyc_XY_XY_109ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_109ln], + (instregex "FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32)")>; +def KryoWrite_1cyc_XA_Y_noRSV_42ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_42ln], + (instregex "FSQRT(S|D)r")>; +def KryoWrite_1cyc_XA_Y_noRSV_120ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_120ln], + (instregex "FSQRTv2f32")>; +def KryoWrite_1cyc_XA_Y_XA_Y_122ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_122ln], + (instregex "FSQRT(v2f64|v4f32)")>; +def KryoWrite_1cyc_X_201ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_X_201ln], + (instregex "INSv.*")>; +def KryoWrite_3cyc_LS_255ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_255ln], + (instregex "LD1(One(v16b|v8h|v4s|v2d)|i64)$")>; +def KryoWrite_4cyc_LS_X_270ln : + SchedWriteRes<[KryoUnitLS, KryoUnitX]> { + let Latency = 4; let 
NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_LS_X_270ln], + (instregex "LD1(i8|i16|i32)$")>; +def KryoWrite_3cyc_LS_noRSV_285ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_285ln], + (instregex "LD1One(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_289ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_289ln, WriteAdr], + (instregex "LD1(One(v16b|v8h|v4s|v2d)|i64)_POST$")>; +def KryoWrite_4cyc_LS_XY_X_298ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_X_298ln, WriteAdr], + (instregex "LD1(i8|i16|i32)_POST$")>; +def KryoWrite_3cyc_LS_LS_LS_308ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_LS_308ln], + (instregex "LD1Three(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_3cyc_LS_XY_noRSV_317ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_317ln, WriteAdr], + (instregex "LD1One(v8b|v4h|v2s|v1d)_POST$")>; +def KryoWrite_3cyc_LS_LS_LS_LS_328ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_LS_LS_328ln, WriteAdr], + (instregex "LD1Four(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_3cyc_LS_XY_LS_LS_332ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_332ln, WriteAdr], + (instregex "LD1Three(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_348ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_348ln], + (instregex "LD1Three(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_LS_LS_LS_351ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_LS_351ln], + (instregex "LD1Four(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_358ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_358ln], + (instregex "LD1Four(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_360ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_360ln, WriteAdr], + (instregex "LD1Three(v8b|v4h|v2s|v1d)_POST$")>; +def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_368ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 7; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_368ln, WriteAdr], + (instregex "LD1Four(v8b|v4h|v2s|v1d)_POST$")>; +def KryoWrite_3cyc_LS_LS_281ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_281ln], + (instregex "LD(1|2)Two(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_3cyc_LS_noRSV_noRSV_311ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_311ln], + (instregex "LD(1|2)Two(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_LS_313ln : + 
SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_313ln, WriteAdr], + (instregex "LD(1|2)Two(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_3cyc_LS_XY_noRSV_noRSV_334ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_noRSV_334ln, WriteAdr], + (instregex "LD(1|2)Two(v8b|v4h|v2s|v1d)_POST$")>; +def KryoWrite_3cyc_LS_256ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_256ln], + (instregex "LD1R(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_3cyc_LS_noRSV_286ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_286ln], + (instregex "LD1R(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_290ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_290ln, WriteAdr], + (instregex "LD1R(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_3cyc_LS_XY_noRSV_318ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_318ln, WriteAdr], + (instregex "LD1R(v8b|v4h|v2s|v1d)_POST$")>; +def KryoWrite_3cyc_LS_257ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_257ln], + (instregex "LD2i64$")>; +def KryoWrite_3cyc_LS_XY_291ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_291ln, WriteAdr], + (instregex "LD2i64_POST$")>; +def KryoWrite_4cyc_LS_X_X_296ln : + SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_4cyc_LS_X_X_296ln], + (instregex "LD2(i8|i16|i32)$")>; +def KryoWrite_4cyc_LS_XY_X_X_321ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_X_X_321ln, WriteAdr], + (instregex "LD2(i8|i16|i32)_POST$")>; +def KryoWrite_3cyc_LS_LS_282ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_282ln], + (instregex "LD2R(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_3cyc_LS_noRSV_noRSV_312ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_312ln], + (instregex "LD2R(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_LS_314ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_314ln, WriteAdr], + (instregex "LD2R(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_3cyc_LS_XY_noRSV_noRSV_335ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_noRSV_335ln, WriteAdr], + (instregex "LD2R(v8b|v4h|v2s|v1d)_POST$")>; +def KryoWrite_3cyc_LS_LS_283ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_283ln], + (instregex "LD3i64$")>; +def KryoWrite_3cyc_LS_LS_LS_309ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_LS_309ln], + (instregex "LD3Threev2d$")>; +def KryoWrite_3cyc_LS_XY_LS_315ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency 
= 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_315ln, WriteAdr], + (instregex "LD3i64_POST$")>; +def KryoWrite_4cyc_LS_X_X_X_320ln : + SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_4cyc_LS_X_X_X_320ln], + (instregex "LD3(i8|i16|i32)$")>; +def KryoWrite_3cyc_LS_XY_LS_LS_331ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_331ln, WriteAdr], + (instregex "LD3Threev2d_POST$")>; +def KryoWrite_4cyc_LS_XY_X_X_X_338ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_X_X_X_338ln, WriteAdr], + (instregex "LD3(i8|i16|i32)_POST$")>; +def KryoWrite_4cyc_LS_LS_X_X_X_noRSV_noRSV_noRSV_373ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 8; +} +def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_noRSV_noRSV_noRSV_373ln], + (instregex "LD3Three(v8b|v4h|v2s)$")>; +def KryoWrite_4cyc_LS_XY_LS_X_X_X_noRSV_noRSV_noRSV_380ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, KryoUnitX, + KryoUnitX]> { + let Latency = 4; let NumMicroOps = 9; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_LS_X_X_X_noRSV_noRSV_noRSV_380ln, WriteAdr], + (instregex "LD3Three(v8b|v4h|v2s)_POST$")>; +def KryoWrite_4cyc_LS_LS_X_X_X_LS_LS_X_X_X_381ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 10; +} +def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_LS_LS_X_X_X_381ln], + (instregex "LD3Three(v16b|v8h|v4s)$")>; +def KryoWrite_4cyc_LS_LS_X_X_X_LS_XY_LS_X_X_X_383ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, KryoUnitX, + KryoUnitX]> { + let Latency = 4; let NumMicroOps = 11; +} +def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_LS_XY_LS_X_X_X_383ln, WriteAdr], + (instregex "LD3Three(v16b|v8h|v4s)_POST$")>; +def KryoWrite_3cyc_LS_LS_LS_310ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_LS_310ln], + (instregex "LD3R(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_3cyc_LS_XY_LS_LS_333ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_333ln, WriteAdr], + (instregex "LD3R(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_349ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_349ln], + (instregex "LD3R(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_361ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_361ln, WriteAdr], + (instregex "LD3R(v8b|v4h|v2s|v1d)_POST$")>; +def KryoWrite_3cyc_LS_LS_284ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_284ln], + (instregex "LD4i64$")>; +def KryoWrite_3cyc_LS_XY_LS_316ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_316ln, WriteAdr], + 
(instregex "LD4i64_POST$")>; +def KryoWrite_3cyc_LS_LS_LS_LS_329ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_LS_LS_329ln], + (instregex "LD4Four(v2d)$")>; +def KryoWrite_4cyc_LS_X_X_X_X_337ln : + SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_4cyc_LS_X_X_X_X_337ln], + (instregex "LD4(i8|i16|i32)$")>; +def KryoWrite_3cyc_LS_XY_LS_LS_LS_350ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_LS_350ln, WriteAdr], + (instregex "LD4Four(v2d)_POST$")>; +def KryoWrite_4cyc_LS_XY_X_X_X_X_355ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX]> { + let Latency = 4; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_X_X_X_X_355ln, WriteAdr], + (instregex "LD4(i8|i16|i32)_POST$")>; +def KryoWrite_4cyc_LS_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_382ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX]> { + let Latency = 4; let NumMicroOps = 10; +} +def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_382ln], + (instregex "LD4Four(v8b|v4h|v2s)$")>; +def KryoWrite_4cyc_LS_XY_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_384ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 11; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_384ln, WriteAdr], + (instregex "LD4Four(v8b|v4h|v2s)_POST$")>; +def KryoWrite_4cyc_LS_LS_X_X_X_X_LS_LS_X_X_X_X_386ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 12; +} +def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_X_LS_LS_X_X_X_X_386ln], + (instregex "LD4Four(v16b|v8h|v4s)$")>; +def KryoWrite_4cyc_LS_LS_X_X_X_X_LS_XY_LS_X_X_X_X_389ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, + KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 13; +} +def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_X_LS_XY_LS_X_X_X_X_389ln, WriteAdr], + (instregex "LD4Four(v16b|v8h|v4s)_POST$")>; +def KryoWrite_3cyc_LS_LS_LS_LS_330ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_LS_LS_330ln], + (instregex "LD4R(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_3cyc_LS_XY_LS_LS_LS_352ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_LS_352ln, WriteAdr], + (instregex "LD4R(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_359ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_359ln], + (instregex "LD4R(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_369ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 7; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_369ln, WriteAdr], + (instregex "LD4R(v8b|v4h|v2s|v1d)_POST$")>; +def 
KryoWrite_3cyc_LS_LS_400ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_400ln], + (instregex "(LDAX?R(B|H|W|X)|LDAXP(W|X))")>; +def KryoWrite_3cyc_LS_LS_401ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_401ln, WriteLDHi], + (instrs LDNPQi)>; +def KryoWrite_3cyc_LS_noRSV_noRSV_408ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_408ln, WriteLDHi], + (instrs LDNPDi, LDNPSi)>; +def KryoWrite_3cyc_LS_394ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_394ln, WriteLDHi], + (instrs LDNPWi, LDNPXi)>; +def KryoWrite_3cyc_LS_LS_402ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_402ln, WriteLDHi], + (instrs LDPQi)>; +def KryoWrite_3cyc_LS_noRSV_noRSV_409ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_409ln, WriteLDHi], + (instrs LDPDi, LDPSi)>; +def KryoWrite_3cyc_LS_XY_LS_410ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_410ln, WriteLDHi, WriteAdr], + (instregex "LDPQ(post|pre)")>; +def KryoWrite_3cyc_LS_XY_noRSV_noRSV_411ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_noRSV_411ln, WriteLDHi, WriteAdr], + (instregex "LDP(D|S)(post|pre)")>; +def KryoWrite_3cyc_LS_393ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_393ln, WriteLDHi], + (instrs LDPWi, LDPXi)>; +def KryoWrite_3cyc_LS_XY_403ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_403ln, WriteLDHi, WriteAdr], + (instregex "LDP(W|X)(post|pre)")>; +def KryoWrite_4cyc_LS_395ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_LS_395ln, WriteLDHi], + (instrs LDPSWi)>; +def KryoWrite_4cyc_LS_XY_405ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_405ln, WriteLDHi, WriteAdr], + (instrs LDPSWpost, LDPSWpre)>; +def KryoWrite_3cyc_LS_264ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_264ln], + (instrs LDRQui, LDRQl)>; +def KryoWrite_4cyc_X_LS_271ln : + SchedWriteRes<[KryoUnitX, KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_LS_271ln], + (instrs LDRQroW, LDRQroX)>; +def KryoWrite_3cyc_LS_noRSV_287ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_287ln], + (instregex "LDR((D|S)l|(D|S|H|B)ui)")>; +def KryoWrite_3cyc_LS_XY_293ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_293ln, WriteAdr], + (instrs LDRQpost, LDRQpre)>; +def KryoWrite_4cyc_X_LS_noRSV_297ln : + SchedWriteRes<[KryoUnitX, KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_4cyc_X_LS_noRSV_297ln], + (instregex "LDR(D|S|H|B)ro(W|X)")>; +def KryoWrite_3cyc_LS_XY_noRSV_319ln : + SchedWriteRes<[KryoUnitLS, 
KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_319ln, WriteAdr], + (instregex "LDR(D|S|H|B)(post|pre)")>; +def KryoWrite_3cyc_LS_261ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_261ln], + (instregex "LDR(BB|HH|W|X)ui")>; +def KryoWrite_3cyc_LS_XY_292ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_292ln, WriteAdr], + (instregex "LDR(BB|HH|W|X)(post|pre)")>; +def KryoWrite_4cyc_X_LS_272ln : + SchedWriteRes<[KryoUnitX, KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_LS_272ln], + (instregex "(LDR(BB|HH|W|X)ro(W|X)|PRFMro(W|X))")>; +def KryoWrite_3cyc_LS_262ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_262ln], + (instrs LDRWl, LDRXl)>; +def KryoWrite_4cyc_LS_268ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_LS_268ln], + (instregex "LDRS(BW|BX|HW|HX|W)ui")>; +def KryoWrite_5cyc_X_LS_273ln : + SchedWriteRes<[KryoUnitX, KryoUnitLS]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_LS_273ln], + (instregex "LDRS(BW|BX|HW|HX|W)ro(W|X)")>; +def KryoWrite_4cyc_LS_XY_294ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_294ln, WriteAdr], + (instregex "LDRS(BW|BX|HW|HX|W)(post|pre)")>; +def KryoWrite_4cyc_LS_269ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_LS_269ln], + (instrs LDRSWl)>; +def KryoWrite_3cyc_LS_260ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_260ln], + (instregex "LDTR(B|H|W|X)i")>; +def KryoWrite_4cyc_LS_267ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_LS_267ln], + (instregex "LDTRS(BW|BX|HW|HX|W)i")>; +def KryoWrite_3cyc_LS_263ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_263ln], + (instrs LDURQi)>; +def KryoWrite_3cyc_LS_noRSV_288ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_288ln], + (instregex "LDUR(D|S|H|B)i")>; +def KryoWrite_3cyc_LS_259ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_259ln], + (instregex "LDUR(BB|HH|W|X)i")>; +def KryoWrite_4cyc_LS_266ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_LS_266ln], + (instregex "LDURS(B|H)?(W|X)i")>; +def KryoWrite_3cyc_LS_258ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_258ln], + (instregex "LDXP(W|X)")>; +def KryoWrite_3cyc_LS_258_1ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_258_1ln], + (instregex "LDXR(B|H|W|X)")>; +def KryoWrite_2cyc_XY_XY_137ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_137ln], + (instrs LSLVWr, LSLVXr)>; +def KryoWrite_1cyc_XY_135ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_135ln], + (instregex 
"(LS|AS|RO)RV(W|X)r")>; +def KryoWrite_4cyc_X_84ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_X_84ln], + (instrs MADDWrrr, MSUBWrrr)>; +def KryoWrite_5cyc_X_85ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_5cyc_X_85ln], + (instrs MADDXrrr, MSUBXrrr)>; +def KryoWrite_4cyc_X_noRSV_188ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_188ln], + (instregex "(MLA|MLS|MUL)(v8i8|v4i16|v2i32)(_indexed)?")>; +def KryoWrite_4cyc_X_X_192ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_X_192ln], + (instregex "(MLA|MLS|MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?")>; +def KryoWrite_1cyc_XY_noRSV_198ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_198ln], + (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)")>; +def KryoWrite_1cyc_XY_XY_199ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_199ln], + (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)")>; +def KryoWrite_1cyc_X_89ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_X_89ln], + (instrs MOVKWi, MOVKXi)>; +def KryoWrite_1cyc_XY_91ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_91ln], + (instrs MOVNWi, MOVNXi)>; +def KryoWrite_1cyc_XY_90ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_90ln], + (instrs MOVZWi, MOVZXi)>; +def KryoWrite_2cyc_XY_93ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_2cyc_XY_93ln], + (instrs MRS)>; +def KryoWrite_0cyc_X_87ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_X_87ln], + (instrs MSRpstateImm4)>; +def : InstRW<[KryoWrite_0cyc_X_87ln], + (instrs MSRpstateImm1)>; +def KryoWrite_0cyc_XY_88ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_XY_88ln], + (instrs MSR)>; +def KryoWrite_1cyc_XY_noRSV_143ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_143ln], + (instregex "NEG(v8i8|v4i16|v2i32|v1i64)")>; +def KryoWrite_1cyc_XY_XY_145ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_145ln], + (instregex "NEG(v16i8|v8i16|v4i32|v2i64)")>; +def KryoWrite_1cyc_XY_noRSV_193ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_193ln], + (instrs NOTv8i8)>; +def KryoWrite_1cyc_XY_XY_194ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_194ln], + (instrs NOTv16i8)>; +def KryoWrite_2cyc_XY_noRSV_234ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_234ln], + (instrs PMULv8i8)>; +def KryoWrite_2cyc_XY_XY_236ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_236ln], + (instrs PMULv16i8)>; +def KryoWrite_2cyc_XY_XY_235ln : + 
SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_235ln], + (instrs PMULLv8i8, PMULLv16i8)>; +def KryoWrite_3cyc_XY_XY_237ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_237ln], + (instrs PMULLv1i64, PMULLv2i64)>; +def KryoWrite_0cyc_LS_254ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_LS_254ln], + (instrs PRFMl, PRFMui)>; +def KryoWrite_0cyc_LS_253ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_LS_253ln], + (instrs PRFUMi)>; +def KryoWrite_6cyc_XY_X_noRSV_175ln : + SchedWriteRes<[KryoUnitXY, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_6cyc_XY_X_noRSV_175ln], + (instregex "R(ADD|SUB)HNv.*")>; +def KryoWrite_2cyc_XY_204ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_2cyc_XY_204ln], + (instrs RBITWr, RBITXr)>; +def KryoWrite_2cyc_XY_noRSV_218ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_218ln], + (instrs RBITv8i8)>; +def KryoWrite_2cyc_XY_XY_219ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_219ln], + (instrs RBITv16i8)>; +def KryoWrite_1cyc_X_202ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_X_202ln], + (instregex "REV(16|32)?(W|X)r")>; +def KryoWrite_1cyc_XY_noRSV_214ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_214ln], + (instregex "REV(16|32|64)(v8i8|v4i16|v2i32)")>; +def KryoWrite_1cyc_XY_XY_216ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_216ln], + (instregex "REV(16|32|64)(v16i8|v8i16|v4i32)")>; +def KryoWrite_3cyc_X_noRSV_244ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_X_noRSV_244ln], + (instregex "S(L|R)I(d|(v8i8|v4i16|v2i32)_shift)")>; +def KryoWrite_3cyc_X_X_245ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_X_X_245ln], + (instregex "S(L|R)I(v16i8|v8i16|v4i32|v2i64)_shift")>; +def KryoWrite_1cyc_XY_2ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_2ln, ReadI, ReadI], + (instregex "SBCS?(W|X)r")>; +def KryoWrite_2cyc_XA_XA_XA_24ln : + SchedWriteRes<[KryoUnitXA, KryoUnitXA, KryoUnitXA]> { + let Latency = 2; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_2cyc_XA_XA_XA_24ln], + (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr)>; +def KryoWrite_1cyc_XY_noRSV_21ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_21ln], + (instrs SHA1Hrr)>; +def KryoWrite_2cyc_X_X_23ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_X_X_23ln], + (instrs SHA1SU0rrr, SHA1SU1rr, SHA256SU0rr)>; +def KryoWrite_4cyc_XA_XA_XA_25ln : + SchedWriteRes<[KryoUnitXA, KryoUnitXA, KryoUnitXA]> { + let Latency = 4; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_4cyc_XA_XA_XA_25ln], + (instrs SHA256Hrrr, SHA256H2rrr)>; +def KryoWrite_3cyc_XY_XY_X_X_26ln : + 
SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_X_X_26ln], + (instrs SHA256SU1rrr)>; +def KryoWrite_4cyc_X_noRSV_189ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_189ln], + (instregex "SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?")>; +def KryoWrite_3cyc_XY_noRSV_68ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_68ln], + (instregex "SQ(ABS|NEG)(v1i8|v1i16|v1i32|v1i64)")>; +def KryoWrite_3cyc_XY_noRSV_157ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_157ln], + (instregex "SQ(ABS|NEG)(v8i8|v4i16|v2i32)")>; +def KryoWrite_3cyc_XY_XY_164ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_164ln], + (instregex "SQ(ABS|NEG)(v16i8|v8i16|v4i32|v2i64)")>; +def KryoWrite_4cyc_X_noRSV_190ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_190ln], + (instregex "SQD(MLAL|MLSL|MULL)(i16|i32)")>; +def KryoWrite_0cyc_LS_Y_274ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_274ln], + (instregex "ST1(One(v8b|v4h|v2s|v1d|v16b|v8h|v4s|v2d)|(i8|i16|i32|i64)|Two(v8b|v4h|v2s|v1d))$")>; +def KryoWrite_1cyc_LS_Y_X_301ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_301ln], + (instregex "ST1(One(v8b|v4h|v2s|v1d|v16b|v8h|v4s|v2d)|(i8|i16|i32|i64)|Two(v8b|v4h|v2s|v1d))_POST$")>; +def KryoWrite_1cyc_LS_Y_XY_305ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_305ln], + (instregex "ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_323ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 4; +} +def : InstRW<[WriteAdr, KryoWrite_0cyc_LS_Y_LS_Y_323ln], + (instregex "ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>; +def KryoWrite_1cyc_LS_Y_XY_LS_Y_345ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_1cyc_LS_Y_XY_LS_Y_345ln], + (instregex "ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_356ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS, + KryoUnitY]> { + let Latency = 0; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_356ln], + (instregex "ST1Three(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_366ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY, + KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 7; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_366ln], + (instregex "ST1Three(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_371ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS, + KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 8; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_371ln], + (instregex "ST1Four(v16b|v8h|v4s|v2d)$")>; +def 
KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_377ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitXY, + KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 9; +} +def : InstRW<[WriteAdr, KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_377ln], + (instregex "ST1Four(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_0cyc_LS_Y_275ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_275ln], + (instregex "ST2(Two(v8b|v4h|v2s|v1d|v16b|v8h|v4s|v2d)|(i8|i16|i32|i64))$")>; +def KryoWrite_1cyc_LS_Y_XY_306ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_306ln], + (instregex "ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))_POST$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_322ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_322ln], + (instregex "ST2Two(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_1cyc_LS_Y_XY_LS_Y_344ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 5; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_344ln], + (instregex "ST2Two(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_324ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_324ln], + (instregex "ST3(Threev1d|(i8|i16|i32|i64))$")>; +def KryoWrite_1cyc_LS_Y_XY_LS_Y_346ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 5; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_346ln], + (instregex "ST3(Threev1d|(i8|i16|i32|i64))_POST$")>; +def KryoWrite_1cyc_X_X_LS_Y_LS_Y_353ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitLS, + KryoUnitY]> { + let Latency = 1; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_LS_Y_353ln], + (instregex "ST3Three(v8b|v4h|v2s)$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_357ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS, + KryoUnitY]> { + let Latency = 0; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_357ln], + (instregex "ST3Threev2d$")>; +def KryoWrite_1cyc_X_X_LS_Y_XY_LS_Y_363ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitXY, + KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 7; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_XY_LS_Y_363ln], + (instregex "ST3Three(v8b|v4h|v2s)_POST$")>; +def KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_367ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY, + KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 7; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_367ln], + (instregex "ST3Threev2d_POST$")>; +def KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_LS_Y_385ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitLS, + KryoUnitY, KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, + KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 12; +} +def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_LS_Y_385ln], + (instregex "ST3Three(v16b|v8h|v4s)$")>; +def KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_XY_LS_Y_388ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitLS, + KryoUnitY, KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, + 
KryoUnitXY, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 13; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_XY_LS_Y_388ln], + (instregex "ST3Three(v16b|v8h|v4s)_POST$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_325ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_325ln], + (instregex "ST4(Fourv1d|(i8|i16|i32|i64))$")>; +def KryoWrite_1cyc_LS_Y_XY_LS_Y_347ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 5; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_347ln], + (instregex "ST4(Fourv1d|(i8|i16|i32|i64))_POST$")>; +def KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_370ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX, + KryoUnitX, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 8; +} +def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_370ln], + (instregex "ST4Four(v8b|v4h|v2s)$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_372ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS, + KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 8; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_372ln], + (instregex "ST4Fourv2d$")>; +def KryoWrite_1cyc_X_X_LS_Y_XY_X_X_LS_Y_375ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitXY, + KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 9; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_XY_X_X_LS_Y_375ln], + (instregex "ST4Four(v8b|v4h|v2s)_POST$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_379ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitXY, + KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 9; +} +def : InstRW<[WriteAdr, KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_379ln], + (instregex "ST4Fourv2d_POST$")>; +def KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_390ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX, + KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitX, + KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitX, KryoUnitLS, + KryoUnitY]> { + let Latency = 1; let NumMicroOps = 16; +} +def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_390ln], + (instregex "ST4Four(v16b|v8h|v4s)$")>; +def KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_XY_X_X_LS_Y_392ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX, + KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitX, + KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitX, KryoUnitX, + KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 17; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_XY_X_X_LS_Y_392ln], + (instregex "ST4Four(v16b|v8h|v4s)_POST$")>; +def KryoWrite_0cyc_LS_LS_Y_299ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_0cyc_LS_LS_Y_299ln], + (instregex "STLR(B|H|W|X)")>; +def KryoWrite_3cyc_LS_LS_Y_307ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitY]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_Y_307ln], + (instregex "STLX(P(W|X)|R(B|H|W|X))")>; +def KryoWrite_0cyc_LS_Y_276ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_276ln], + (instrs STNPDi, STNPSi)>; +def KryoWrite_0cyc_LS_Y_LS_Y_326ln : + 
SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_326ln], + (instrs STNPQi)>; +def KryoWrite_0cyc_LS_Y_280ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_280ln], + (instrs STNPWi, STNPXi)>; +def KryoWrite_0cyc_LS_Y_277ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_277ln], + (instregex "STP(D|S)i")>; +def KryoWrite_1cyc_LS_Y_X_303ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_303ln], + (instregex "STP(D|S)(post|pre)")>; +def KryoWrite_0cyc_LS_Y_LS_Y_327ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_327ln], + (instrs STPQi)>; +def KryoWrite_1cyc_LS_Y_X_LS_Y_343ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 5; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_LS_Y_343ln], + (instrs STPQpost, STPQpre)>; +def KryoWrite_0cyc_LS_Y_279ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_279ln], + (instregex "STP(W|X)i")>; +def KryoWrite_1cyc_LS_X_Y_300ln : + SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_X_Y_300ln], + (instregex "STP(W|X)(post|pre)")>; +def KryoWrite_0cyc_LS_Y_278ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_278ln], + (instregex "STR(Q|D|S|H|B)ui")>; +def KryoWrite_1cyc_X_LS_Y_295ln : + SchedWriteRes<[KryoUnitX, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_1cyc_X_LS_Y_295ln], + (instregex "STR(D|S|H|B)ro(W|X)")>; +def KryoWrite_1cyc_LS_Y_X_304ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_304ln], + (instregex "STR(Q|D|S|H|B)(post|pre)")>; +def KryoWrite_2cyc_X_LS_Y_XY_LS_Y_354ln : + SchedWriteRes<[KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, + KryoUnitY]> { + let Latency = 2; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_2cyc_X_LS_Y_XY_LS_Y_354ln], + (instregex "STRQro(W|X)")>; +def KryoWrite_0cyc_LS_Y_399ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_399ln], + (instregex "STR(BB|HH|W|X)ui")>; +def KryoWrite_1cyc_X_LS_Y_406ln : + SchedWriteRes<[KryoUnitX, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_1cyc_X_LS_Y_406ln], + (instregex "STR(BB|HH|W|X)ro(W|X)")>; +def KryoWrite_1cyc_LS_X_Y_407ln : + SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_X_Y_407ln], + (instregex "STR(BB|HH|W|X)(post|pre)")>; +def KryoWrite_0cyc_LS_Y_398ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_398ln], + (instregex "STTR(B|H|W|X)i")>; +def KryoWrite_0cyc_LS_Y_396ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : 
InstRW<[KryoWrite_0cyc_LS_Y_396ln], + (instregex "STUR(Q|D|S|H|B)i")>; +def KryoWrite_0cyc_LS_Y_397ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_397ln], + (instregex "STUR(BB|HH|W|X)i")>; +def KryoWrite_3cyc_LS_Y_404ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_Y_404ln], + (instregex "STX(P(W|X)|R(B|H|W|X))")>; +def KryoWrite_3cyc_XY_noRSV_160ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_160ln], + (instregex "^(SU|US)QADD(v8i8|v4i16|v2i32)")>; +def KryoWrite_3cyc_XY_XY_167ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_167ln], + (instregex "^(SU|US)QADD(v16i8|v8i16|v4i32|v2i64)")>; +def KryoWrite_1cyc_XY_1ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_1ln, ReadI], + (instregex "SUBS?(W|X)ri")>; +def KryoWrite_2cyc_XY_XY_5ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_5ln, ReadI, ReadIEReg], + (instregex "SUBS?(W|X)rx")>; +def KryoWrite_2cyc_XY_XY_5_1ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_5_1ln, ReadI, ReadISReg], + (instregex "SUBS?(W|X)rs")>; +def KryoWrite_1cyc_XY_noRSV_6ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_6ln, ReadI, ReadI], + (instregex "SUBS?(W|X)rr")>; +def KryoWrite_0cyc_LS_9ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_LS_9ln], + (instregex "SYSL?xt")>; +def KryoWrite_1cyc_X_noRSV_205ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_noRSV_205ln], + (instrs TBLv8i8One)>; +def KryoWrite_1cyc_X_X_208ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_X_208ln], + (instrs TBLv16i8One)>; +def KryoWrite_2cyc_X_X_X_noRSV_222ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 2; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_2cyc_X_X_X_noRSV_222ln], + (instrs TBLv8i8Two)>; +def KryoWrite_2cyc_X_X_X_X_X_X_224ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX]> { + let Latency = 2; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_2cyc_X_X_X_X_X_X_224ln], + (instrs TBLv16i8Two)>; +def KryoWrite_3cyc_X_X_X_X_X_noRSV_225ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_3cyc_X_X_X_X_X_noRSV_225ln], + (instrs TBLv8i8Three)>; +def KryoWrite_3cyc_X_X_X_X_X_X_X_noRSV_228ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 8; +} +def : InstRW<[KryoWrite_3cyc_X_X_X_X_X_X_X_noRSV_228ln], + (instrs TBLv8i8Four)>; +def KryoWrite_4cyc_X_X_X_X_X_X_X_X_XY_X_X_230ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitXY, KryoUnitX, + KryoUnitX]> { + let Latency = 4; let NumMicroOps = 11; +} +def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_X_X_XY_X_X_230ln], + (instrs TBLv16i8Three)>; +def 
KryoWrite_4cyc_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_232ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 15; +} +def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_232ln], + (instrs TBLv16i8Four)>; +def KryoWrite_2cyc_X_X_noRSV_220ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 2; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_2cyc_X_X_noRSV_220ln], + (instrs TBXv8i8One)>; +def KryoWrite_2cyc_X_X_X_X_221ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 2; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_2cyc_X_X_X_X_221ln], + (instrs TBXv16i8One)>; +def KryoWrite_3cyc_X_X_X_X_noRSV_223ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_3cyc_X_X_X_X_noRSV_223ln], + (instrs TBXv8i8Two)>; +def KryoWrite_4cyc_X_X_X_X_X_X_noRSV_226ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX]> { + let Latency = 4; let NumMicroOps = 7; +} +def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_noRSV_226ln], + (instrs TBXv8i8Three)>; +def KryoWrite_3cyc_X_X_X_X_X_X_X_X_227ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 8; +} +def : InstRW<[KryoWrite_3cyc_X_X_X_X_X_X_X_X_227ln], + (instrs TBXv16i8Two)>; +def KryoWrite_4cyc_X_X_X_X_X_X_X_X_noRSV_229ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 9; +} +def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_X_X_noRSV_229ln], + (instrs TBXv8i8Four)>; +def KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_XY_X_X_X_231ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitXY, + KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 5; let NumMicroOps = 13; +} +def : InstRW<[KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_XY_X_X_X_231ln], + (instrs TBXv16i8Three)>; +def KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_X_233ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX]> { + let Latency = 5; let NumMicroOps = 17; +} +def : InstRW<[KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_X_233ln], + (instrs TBXv16i8Four)>; +def KryoWrite_1cyc_XY_XY_217ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_217ln], + (instregex "((TRN1|TRN2|ZIP1|UZP1|UZP2)v2i64|ZIP2(v2i64|v4i32|v8i16|v16i8))")>; +def KryoWrite_1cyc_X_X_211ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_X_211ln], + (instregex "(TRN1|TRN2)(v4i32|v8i16|v16i8)")>; +def KryoWrite_1cyc_X_XY_213ln : + SchedWriteRes<[KryoUnitX, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_XY_213ln], + (instregex "(TRN1|TRN2)(v2i32|v4i16|v8i8)")>; +def KryoWrite_3cyc_XY_noRSV_156ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_156ln], + (instrs URECPEv2i32, URSQRTEv2i32)>; +def KryoWrite_3cyc_XY_XY_168ln : + SchedWriteRes<[KryoUnitXY, 
KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_168ln], + (instrs URECPEv4i32, URSQRTEv4i32)>; +def KryoWrite_1cyc_X_X_210ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_X_210ln], + (instregex "(UZP1|UZP2)(v4i32|v8i16|v16i8)")>; +def KryoWrite_1cyc_X_noRSV_206ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_noRSV_206ln], + (instregex "(UZP1|UZP2|ZIP1|ZIP2)(v2i32|v4i16|v8i8)")>; +def KryoWrite_1cyc_XY_noRSV_215ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_215ln], + (instregex "XTNv.*")>; +def KryoWrite_1cyc_X_X_209ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_X_209ln], + (instregex "ZIP1(v4i32|v8i16|v16i8)")>; diff --git a/lib/Target/AArch64/AArch64SchedM1.td b/lib/Target/AArch64/AArch64SchedM1.td index 6525628dbfd6e..2288b8dfc223c 100644 --- a/lib/Target/AArch64/AArch64SchedM1.td +++ b/lib/Target/AArch64/AArch64SchedM1.td @@ -19,9 +19,8 @@ def ExynosM1Model : SchedMachineModel { let IssueWidth = 4; // Up to 4 uops per cycle. - let MinLatency = 0; // OoO. let MicroOpBufferSize = 96; // ROB size. - let LoopMicroOpBufferSize = 32; // Instruction queue size. + let LoopMicroOpBufferSize = 24; // Based on the instruction queue size. let LoadLatency = 4; // Optimistic load cases. let MispredictPenalty = 14; // Minimum branch misprediction penalty. let CompleteModel = 0; // Use the default model otherwise. @@ -142,12 +141,13 @@ def : WriteRes<WriteVST, [M1UnitS, M1UnitFST]> { let Latency = 1; } def : WriteRes<WriteV, [M1UnitFADD]> { let Latency = 3; } // Other miscellaneous instructions. -def : WriteRes<WriteSys, []> { let Latency = 1; } +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } def : WriteRes<WriteBarrier, []> { let Latency = 1; } def : WriteRes<WriteHint, []> { let Latency = 1; } +def : WriteRes<WriteSys, []> { let Latency = 1; } //===----------------------------------------------------------------------===// -// Fast forwarding. +// Generic fast forwarding. // TODO: Add FP register forwarding rules. @@ -187,6 +187,10 @@ def M1WriteNEONH : SchedWriteRes<[M1UnitNALU, M1UnitFST]> { let Latency = 3; } def M1WriteNEONI : SchedWriteRes<[M1UnitFST, M1UnitL]> { let Latency = 9; } +def M1WriteNEONJ : SchedWriteRes<[M1UnitNMISC, + M1UnitFMAC]> { let Latency = 6; } +def M1WriteNEONK : SchedWriteRes<[M1UnitNMISC, + M1UnitFMAC]> { let Latency = 7; } def M1WriteALU1 : SchedWriteRes<[M1UnitALU]> { let Latency = 1; } def M1WriteB : SchedWriteRes<[M1UnitB]> { let Latency = 1; } // FIXME: This is the worst case, conditional branch and link. @@ -305,8 +309,10 @@ def : InstRW<[M1WriteFVAR15], (instregex "FSQRTv.f32")>; def : InstRW<[M1WriteFVAR23], (instregex "FSQRTv2f64")>; def : InstRW<[M1WriteNMISC1], (instregex "^F(MAX|MIN)(NM)?V?v")>; def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN)(NM)?Pv")>; -def : InstRW<[M1WriteFMAC4], (instregex "^FMULX?v")>; -def : InstRW<[M1WriteFMAC5], (instregex "^FML[AS]v")>; +def : InstRW<[M1WriteNEONJ], (instregex "^FMULX?v.i")>; +def : InstRW<[M1WriteFMAC4], (instregex "^FMULX?v.f")>; +def : InstRW<[M1WriteNEONK], (instregex "^FML[AS]v.i")>; +def : InstRW<[M1WriteFMAC5], (instregex "^FML[AS]v.f")>; def : InstRW<[M1WriteFCVT3], (instregex "^FRINT[AIMNPXZ]v")>; // ASIMD miscellaneous instructions. 
@@ -337,16 +343,19 @@ def : InstRW<[WriteSequence<[M1WriteNAL12], 4>], (instregex "^TB[LX]v16i8Four")>; def : InstRW<[M1WriteNEOND], (instregex "^[SU]MOVv")>; def : InstRW<[M1WriteNALU1], (instregex "^INSv.+lane")>; -def : InstRW<[M1WriteNALU1], (instregex "^(TRN|UZP)(1|2)(v8i8|v4i16|v2i32)")>; -def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)(1|2)(v16i8|v8i16|v4i32|v2i64)")>; -def : InstRW<[M1WriteNALU1], (instregex "^ZIP(1|2)v")>; +def : InstRW<[M1WriteNALU1], (instregex "^(TRN|UZP)[12](v8i8|v4i16|v2i32)")>; +def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)[12](v16i8|v8i16|v4i32|v2i64)")>; +def : InstRW<[M1WriteNALU1], (instregex "^ZIP[12]v")>; // ASIMD load instructions. // ASIMD store instructions. // Cryptography instructions. -def : InstRW<[M1WriteNCRYPT1], (instregex "^AES")>; +def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; } +def M1ReadAES : SchedReadAdvance<1, [M1WriteAES]>; +def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AES")>; + def : InstRW<[M1WriteNCRYPT1], (instregex "^PMUL")>; def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA1(H|SU)")>; def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA1[CMP]")>; diff --git a/lib/Target/AArch64/AArch64SchedVulcan.td b/lib/Target/AArch64/AArch64SchedVulcan.td new file mode 100644 index 0000000000000..0aa2462eba837 --- /dev/null +++ b/lib/Target/AArch64/AArch64SchedVulcan.td @@ -0,0 +1,855 @@ +//=- AArch64SchedVulcan.td - Vulcan Scheduling Defs ----------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// 1. Introduction +// +// This file defines the machine model for Broadcom Vulcan to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// 2. Pipeline Description. + +def VulcanModel : SchedMachineModel { + let IssueWidth = 4; // 4 micro-ops dispatched at a time. + let MicroOpBufferSize = 180; // 180 entries in micro-op re-order buffer. + let LoadLatency = 4; // Optimistic load latency. + let MispredictPenalty = 12; // Extra cycles for mispredicted branch. + // Determined via a mix of micro-arch details and experimentation. + let LoopMicroOpBufferSize = 32; + let PostRAScheduler = 1; // Using PostRA sched. + let CompleteModel = 1; +} + +// Define the issue ports. + +// Port 0: ALU, FP/SIMD. +def VulcanP0 : ProcResource<1>; + +// Port 1: ALU, FP/SIMD, integer mul/div. +def VulcanP1 : ProcResource<1>; + +// Port 2: ALU, Branch. +def VulcanP2 : ProcResource<1>; + +// Port 3: Store data. +def VulcanP3 : ProcResource<1>; + +// Port 4: Load/store. +def VulcanP4 : ProcResource<1>; + +// Port 5: Load/store. +def VulcanP5 : ProcResource<1>; + +let SchedModel = VulcanModel in { + +// Define groups for the functional units on each +// issue port. Each group created will be used +// by a WriteRes later on. +// +// NOTE: Some groups only contain one member. This +// is a way to create names for the various functional +// units that share a single issue port. For example, +// "VulcanI1" for ALU ops on port 1 and "VulcanF1" for +// FP ops on port 1. + +// Integer divide and multiply micro-ops only on port 1. +def VulcanI1 : ProcResGroup<[VulcanP1]>; + +// Branch micro-ops only on port 2. 
+def VulcanI2 : ProcResGroup<[VulcanP2]>;
+
+// ALU micro-ops on ports 0, 1, and 2.
+def VulcanI012 : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2]>;
+
+// Crypto FP/SIMD micro-ops only on port 1.
+def VulcanF1 : ProcResGroup<[VulcanP1]>;
+
+// FP/SIMD micro-ops on ports 0 and 1.
+def VulcanF01 : ProcResGroup<[VulcanP0, VulcanP1]>;
+
+// Store data micro-ops only on port 3.
+def VulcanSD : ProcResGroup<[VulcanP3]>;
+
+// Load/store micro-ops on ports 4 and 5.
+def VulcanLS01 : ProcResGroup<[VulcanP4, VulcanP5]>;
+
+// 60 entry unified scheduler.
+def VulcanAny : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2,
+                              VulcanP3, VulcanP4, VulcanP5]> {
+  let BufferSize=60;
+}
+
+// Define commonly used write types for InstRW specializations.
+// All definitions follow the format: VulcanWrite_<NumCycles>Cyc_<Resources>.
+
+// 3 cycles on I1.
+def VulcanWrite_3Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 3; }
+
+// 4 cycles on I1.
+def VulcanWrite_4Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 4; }
+
+// 1 cycle on I0, I1, or I2.
+def VulcanWrite_1Cyc_I012 : SchedWriteRes<[VulcanI012]> { let Latency = 1; }
+
+// 5 cycles on F1.
+def VulcanWrite_5Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 5; }
+
+// 7 cycles on F1.
+def VulcanWrite_7Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 7; }
+
+// 4 cycles on F0 or F1.
+def VulcanWrite_4Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 4; }
+
+// 5 cycles on F0 or F1.
+def VulcanWrite_5Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 5; }
+
+// 6 cycles on F0 or F1.
+def VulcanWrite_6Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 6; }
+
+// 7 cycles on F0 or F1.
+def VulcanWrite_7Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 7; }
+
+// 8 cycles on F0 or F1.
+def VulcanWrite_8Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 8; }
+
+// 16 cycles on F0 or F1.
+def VulcanWrite_16Cyc_F01 : SchedWriteRes<[VulcanF01]> {
+  let Latency = 16;
+  let ResourceCycles = [8];
+}
+
+// 23 cycles on F0 or F1.
+def VulcanWrite_23Cyc_F01 : SchedWriteRes<[VulcanF01]> {
+  let Latency = 23;
+  let ResourceCycles = [11];
+}
+
+// 1 cycle on LS0 or LS1.
+def VulcanWrite_1Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 1; }
+
+// 4 cycles on LS0 or LS1.
+def VulcanWrite_4Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 4; }
+
+// 5 cycles on LS0 or LS1.
+def VulcanWrite_5Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 5; }
+
+// 6 cycles on LS0 or LS1.
+def VulcanWrite_6Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 6; }
+
+// 5 cycles on LS0 or LS1 and I0, I1, or I2.
+def VulcanWrite_5Cyc_LS01_I012 : SchedWriteRes<[VulcanLS01, VulcanI012]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+
+// 6 cycles on LS0 or LS1 and 2 of I0, I1, or I2.
+def VulcanWrite_6Cyc_LS01_I012_I012 :
+  SchedWriteRes<[VulcanLS01, VulcanI012, VulcanI012]> {
+  let Latency = 6;
+  let NumMicroOps = 3;
+}
+
+// 1 cycle on LS0 or LS1 and F0 or F1.
+def VulcanWrite_1Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+
+// 5 cycles on LS0 or LS1 and F0 or F1.
+def VulcanWrite_5Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+
+// 6 cycles on LS0 or LS1 and F0 or F1.
+def VulcanWrite_6Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+
+// 7 cycles on LS0 or LS1 and F0 or F1.
+def VulcanWrite_7Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> { + let Latency = 7; + let NumMicroOps = 2; +} + +// 8 cycles on LS0 or LS1 and F0 or F1. +def VulcanWrite_8Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> { + let Latency = 8; + let NumMicroOps = 2; +} + +// Define commonly used read types. + +// No forwarding is provided for these types. +def : ReadAdvance<ReadI, 0>; +def : ReadAdvance<ReadISReg, 0>; +def : ReadAdvance<ReadIEReg, 0>; +def : ReadAdvance<ReadIM, 0>; +def : ReadAdvance<ReadIMA, 0>; +def : ReadAdvance<ReadID, 0>; +def : ReadAdvance<ReadExtrHi, 0>; +def : ReadAdvance<ReadAdrBase, 0>; +def : ReadAdvance<ReadVLD, 0>; + +} + + +//===----------------------------------------------------------------------===// +// 3. Instruction Tables. + +let SchedModel = VulcanModel in { + +//--- +// 3.1 Branch Instructions +//--- + +// Branch, immed +// Branch and link, immed +// Compare and branch +def : WriteRes<WriteBr, [VulcanI2]> { let Latency = 1; } + +def : WriteRes<WriteSys, []> { let Latency = 1; } +def : WriteRes<WriteBarrier, []> { let Latency = 1; } +def : WriteRes<WriteHint, []> { let Latency = 1; } + +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } + +// Branch, register +// Branch and link, register != LR +// Branch and link, register = LR +def : WriteRes<WriteBrReg, [VulcanI2]> { let Latency = 1; } + +//--- +// 3.2 Arithmetic and Logical Instructions +// 3.3 Move and Shift Instructions +//--- + +// ALU, basic +// Conditional compare +// Conditional select +// Address generation +def : WriteRes<WriteI, [VulcanI012]> { let Latency = 1; } +def : InstRW<[WriteI], (instrs COPY)>; + +// ALU, extend and/or shift +def : WriteRes<WriteISReg, [VulcanI012]> { + let Latency = 2; + let ResourceCycles = [2]; +} + +def : WriteRes<WriteIEReg, [VulcanI012]> { + let Latency = 2; + let ResourceCycles = [2]; +} + +// Move immed +def : WriteRes<WriteImm, [VulcanI012]> { let Latency = 1; } + +// Variable shift +def : WriteRes<WriteIS, [VulcanI012]> { let Latency = 1; } + +//--- +// 3.4 Divide and Multiply Instructions +//--- + +// Divide, W-form +// Latency range of 13-23. Take the average. +def : WriteRes<WriteID32, [VulcanI1]> { + let Latency = 18; + let ResourceCycles = [18]; +} + +// Divide, X-form +// Latency range of 13-39. Take the average. +def : WriteRes<WriteID64, [VulcanI1]> { + let Latency = 26; + let ResourceCycles = [26]; +} + +// Multiply accumulate, W-form +def : WriteRes<WriteIM32, [VulcanI012]> { let Latency = 5; } + +// Multiply accumulate, X-form +def : WriteRes<WriteIM64, [VulcanI012]> { let Latency = 5; } + +// Bitfield extract, two reg +def : WriteRes<WriteExtr, [VulcanI012]> { let Latency = 1; } + +// Bitfield move, basic +// Bitfield move, insert +// NOTE: Handled by WriteIS. + +// Count leading +def : InstRW<[VulcanWrite_3Cyc_I1], (instregex "^CLS(W|X)r$", + "^CLZ(W|X)r$")>; + +// Reverse bits/bytes +// NOTE: Handled by WriteI. + +//--- +// 3.6 Load Instructions +// 3.10 FP Load Instructions +//--- + +// Load register, literal +// Load register, unscaled immed +// Load register, immed unprivileged +// Load register, unsigned immed +def : WriteRes<WriteLD, [VulcanLS01]> { let Latency = 4; } + +// Load register, immed post-index +// NOTE: Handled by WriteLD, WriteI. +// Load register, immed pre-index +// NOTE: Handled by WriteLD, WriteAdr. 
+def : WriteRes<WriteAdr, [VulcanI012]> { let Latency = 1; }
+
+// Load register offset, basic
+// Load register, register offset, scale by 4/8
+// Load register, register offset, scale by 2
+// Load register offset, extend
+// Load register, register offset, extend, scale by 4/8
+// Load register, register offset, extend, scale by 2
+def VulcanWriteLDIdx : SchedWriteVariant<[
+  SchedVar<ScaledIdxPred, [VulcanWrite_6Cyc_LS01_I012_I012]>,
+  SchedVar<NoSchedPred,   [VulcanWrite_5Cyc_LS01_I012]>]>;
+def : SchedAlias<WriteLDIdx, VulcanWriteLDIdx>;
+
+def VulcanReadAdrBase : SchedReadVariant<[
+  SchedVar<ScaledIdxPred, [ReadDefault]>,
+  SchedVar<NoSchedPred,   [ReadDefault]>]>;
+def : SchedAlias<ReadAdrBase, VulcanReadAdrBase>;
+
+// Load pair, immed offset, normal
+// Load pair, immed offset, signed words, base != SP
+// Load pair, immed offset, signed words, base = SP
+// LDP only breaks into *one* LS micro-op. Thus
+// the resources are handled by WriteLD.
+def : WriteRes<WriteLDHi, []> {
+  let Latency = 5;
+}
+
+// Load pair, immed pre-index, normal
+// Load pair, immed pre-index, signed words
+// Load pair, immed post-index, normal
+// Load pair, immed post-index, signed words
+// NOTE: Handled by WriteLD, WriteLDHi, WriteAdr.
+
+//--
+// 3.7 Store Instructions
+// 3.11 FP Store Instructions
+//--
+
+// Store register, unscaled immed
+// Store register, immed unprivileged
+// Store register, unsigned immed
+def : WriteRes<WriteST, [VulcanLS01, VulcanSD]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+
+// Store register, immed post-index
+// NOTE: Handled by WriteAdr, WriteST, ReadAdrBase
+
+// Store register, immed pre-index
+// NOTE: Handled by WriteAdr, WriteST
+
+// Store register, register offset, basic
+// Store register, register offset, scaled by 4/8
+// Store register, register offset, scaled by 2
+// Store register, register offset, extend
+// Store register, register offset, extend, scale by 4/8
+// Store register, register offset, extend, scale by 1
+def : WriteRes<WriteSTIdx, [VulcanLS01, VulcanSD, VulcanI012]> {
+  let Latency = 1;
+  let NumMicroOps = 3;
+}
+
+// Store pair, immed offset, W-form
+// Store pair, immed offset, X-form
+def : WriteRes<WriteSTP, [VulcanLS01, VulcanSD]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+
+// Store pair, immed post-index, W-form
+// Store pair, immed post-index, X-form
+// Store pair, immed pre-index, W-form
+// Store pair, immed pre-index, X-form
+// NOTE: Handled by WriteAdr, WriteSTP.
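For context, a machine model like VulcanModel only takes effect once a processor definition selects it. A hedged sketch of that hookup, which in the real tree belongs in AArch64.td rather than here and whose exact feature list is an assumption, would look roughly like:

def ProcVulcan : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan",
                                  "Broadcom Vulcan processors", [
                                  FeatureCRC,
                                  FeatureCrypto,
                                  FeatureFPARMv8,
                                  FeatureNEON,
                                  FeaturePostRAScheduler]>;

def : ProcessorModel<"vulcan", VulcanModel, [ProcVulcan]>;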
+ +//--- +// 3.8 FP Data Processing Instructions +//--- + +// FP absolute value +// FP min/max +// FP negate +def : WriteRes<WriteF, [VulcanF01]> { let Latency = 5; } + +// FP arithmetic +def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADD", "^FSUB")>; + +// FP compare +def : WriteRes<WriteFCmp, [VulcanF01]> { let Latency = 5; } + +// FP divide, S-form +// FP square root, S-form +def : WriteRes<WriteFDiv, [VulcanF01]> { + let Latency = 16; + let ResourceCycles = [8]; +} + +// FP divide, D-form +// FP square root, D-form +def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVDrr, FSQRTDr)>; + +// FP multiply +// FP multiply accumulate +def : WriteRes<WriteFMul, [VulcanF01]> { let Latency = 6; } + +// FP round to integral +def : InstRW<[VulcanWrite_7Cyc_F01], + (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>; + +// FP select +def : InstRW<[VulcanWrite_4Cyc_F01], (instregex "^FCSEL")>; + +//--- +// 3.9 FP Miscellaneous Instructions +//--- + +// FP convert, from vec to vec reg +// FP convert, from gen to vec reg +// FP convert, from vec to gen reg +def : WriteRes<WriteFCvt, [VulcanF01]> { let Latency = 7; } + +// FP move, immed +// FP move, register +def : WriteRes<WriteFImm, [VulcanF01]> { let Latency = 4; } + +// FP transfer, from gen to vec reg +// FP transfer, from vec to gen reg +def : WriteRes<WriteFCopy, [VulcanF01]> { let Latency = 4; } +def : InstRW<[VulcanWrite_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>; + +//--- +// 3.12 ASIMD Integer Instructions +//--- + +// ASIMD absolute diff, D-form +// ASIMD absolute diff, Q-form +// ASIMD absolute diff accum, D-form +// ASIMD absolute diff accum, Q-form +// ASIMD absolute diff accum long +// ASIMD absolute diff long +// ASIMD arith, basic +// ASIMD arith, complex +// ASIMD compare +// ASIMD logical (AND, BIC, EOR) +// ASIMD max/min, basic +// ASIMD max/min, reduce, 4H/4S +// ASIMD max/min, reduce, 8B/8H +// ASIMD max/min, reduce, 16B +// ASIMD multiply, D-form +// ASIMD multiply, Q-form +// ASIMD multiply accumulate long +// ASIMD multiply accumulate saturating long +// ASIMD multiply long +// ASIMD pairwise add and accumulate +// ASIMD shift accumulate +// ASIMD shift by immed, basic +// ASIMD shift by immed and insert, basic, D-form +// ASIMD shift by immed and insert, basic, Q-form +// ASIMD shift by immed, complex +// ASIMD shift by register, basic, D-form +// ASIMD shift by register, basic, Q-form +// ASIMD shift by register, complex, D-form +// ASIMD shift by register, complex, Q-form +def : WriteRes<WriteV, [VulcanF01]> { let Latency = 7; } + +// ASIMD arith, reduce, 4H/4S +// ASIMD arith, reduce, 8B/8H +// ASIMD arith, reduce, 16B +def : InstRW<[VulcanWrite_5Cyc_F01], + (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>; + +// ASIMD logical (MOV, MVN, ORN, ORR) +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ORRv", "^ORNv", "^NOTv")>; + +// ASIMD polynomial (8x8) multiply long +def : InstRW<[VulcanWrite_5Cyc_F01], (instrs PMULLv8i8, PMULLv16i8)>; + +//--- +// 3.13 ASIMD Floating-point Instructions +//--- + +// ASIMD FP absolute value +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FABSv")>; + +// ASIMD FP arith, normal, D-form +// ASIMD FP arith, normal, Q-form +def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FABDv", "^FADDv", "^FSUBv")>; + +// ASIMD FP arith,pairwise, D-form +// ASIMD FP arith, pairwise, Q-form +def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADDPv")>; + +// ASIMD FP compare, D-form +// ASIMD FP compare, Q-form +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FACGEv", "^FACGTv")>; +def : 
InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv", + "^FCMGTv", "^FCMLEv", + "^FCMLTv")>; + +// ASIMD FP convert, long +// ASIMD FP convert, narrow +// ASIMD FP convert, other, D-form +// ASIMD FP convert, other, Q-form +// NOTE: Handled by WriteV. + +// ASIMD FP divide, D-form, F32 +def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv2f32)>; + +// ASIMD FP divide, Q-form, F32 +def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv4f32)>; + +// ASIMD FP divide, Q-form, F64 +def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVv2f64)>; + +// ASIMD FP max/min, normal, D-form +// ASIMD FP max/min, normal, Q-form +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXv", "^FMAXNMv", + "^FMINv", "^FMINNMv")>; + +// ASIMD FP max/min, pairwise, D-form +// ASIMD FP max/min, pairwise, Q-form +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXPv", "^FMAXNMPv", + "^FMINPv", "^FMINNMPv")>; + +// ASIMD FP max/min, reduce +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXVv", "^FMAXNMVv", + "^FMINVv", "^FMINNMVv")>; + +// ASIMD FP multiply, D-form, FZ +// ASIMD FP multiply, D-form, no FZ +// ASIMD FP multiply, Q-form, FZ +// ASIMD FP multiply, Q-form, no FZ +def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMULv", "^FMULXv")>; + +// ASIMD FP multiply accumulate, Dform, FZ +// ASIMD FP multiply accumulate, Dform, no FZ +// ASIMD FP multiply accumulate, Qform, FZ +// ASIMD FP multiply accumulate, Qform, no FZ +def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMLAv", "^FMLSv")>; + +// ASIMD FP negate +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FNEGv")>; + +// ASIMD FP round, D-form +// ASIMD FP round, Q-form +// NOTE: Handled by WriteV. + +//-- +// 3.14 ASIMD Miscellaneous Instructions +//-- + +// ASIMD bit reverse +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^RBITv")>; + +// ASIMD bitwise insert, D-form +// ASIMD bitwise insert, Q-form +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^BIFv", "^BITv", "^BSLv")>; + +// ASIMD count, D-form +// ASIMD count, Q-form +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^CLSv", "^CLZv", "^CNTv")>; + +// ASIMD duplicate, gen reg +// ASIMD duplicate, element +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^DUPv")>; + +// ASIMD extract +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^EXTv")>; + +// ASIMD extract narrow +// ASIMD extract narrow, saturating +// NOTE: Handled by WriteV. 
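Several categories above are noted as "Handled by WriteV": no InstRW matches those opcodes, so they fall back to the generic 7-cycle WriteV entry given in section 3.12. If such a category later needed its own timing, it would be split out the same way as the surrounding entries. A purely hypothetical sketch, not part of this patch and with the 5-cycle figure chosen arbitrarily:

// ASIMD extract narrow
// ASIMD extract narrow, saturating
def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^XTNv", "^SQXTNv", "^UQXTNv")>;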
+ +// ASIMD insert, element to element +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>; + +// ASIMD move, integer immed +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^MOVIv", "^MOVIDv")>; + +// ASIMD move, FP immed +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMOVv")>; + +// ASIMD reciprocal estimate, D-form +// ASIMD reciprocal estimate, Q-form +def : InstRW<[VulcanWrite_5Cyc_F01], + (instregex "^FRECPEv", "^FRECPXv", "^URECPEv", + "^FRSQRTEv", "^URSQRTEv")>; + +// ASIMD reciprocal step, D-form, FZ +// ASIMD reciprocal step, D-form, no FZ +// ASIMD reciprocal step, Q-form, FZ +// ASIMD reciprocal step, Q-form, no FZ +def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FRECPSv", "^FRSQRTSv")>; + +// ASIMD reverse +def : InstRW<[VulcanWrite_5Cyc_F01], + (instregex "^REV16v", "^REV32v", "^REV64v")>; + +// ASIMD table lookup, D-form +// ASIMD table lookup, Q-form +def : InstRW<[VulcanWrite_8Cyc_F01], (instregex "^TBLv", "^TBXv")>; + +// ASIMD transfer, element to word or word +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^UMOVv")>; + +// ASIMD transfer, element to gen reg +def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^SMOVv", "^UMOVv")>; + +// ASIMD transfer gen reg to element +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>; + +// ASIMD transpose +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^TRN1v", "^TRN2v", + "^UZP1v", "^UZP2v")>; + +// ASIMD unzip/zip +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>; + +//-- +// 3.15 ASIMD Load Instructions +//-- + +// ASIMD load, 1 element, multiple, 1 reg, D-form +// ASIMD load, 1 element, multiple, 1 reg, Q-form +def : InstRW<[VulcanWrite_4Cyc_LS01], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 2 reg, D-form +// ASIMD load, 1 element, multiple, 2 reg, Q-form +def : InstRW<[VulcanWrite_4Cyc_LS01], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 3 reg, D-form +// ASIMD load, 1 element, multiple, 3 reg, Q-form +def : InstRW<[VulcanWrite_5Cyc_LS01], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_5Cyc_LS01, WriteAdr], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 4 reg, D-form +// ASIMD load, 1 element, multiple, 4 reg, Q-form +def : InstRW<[VulcanWrite_6Cyc_LS01], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_6Cyc_LS01, WriteAdr], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, one lane, B/H/S +// ASIMD load, 1 element, one lane, D +def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD1i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], + (instregex "^LD1i(8|16|32|64)_POST$")>; + +// ASIMD load, 1 element, all lanes, D-form, B/H/S +// ASIMD load, 1 element, all lanes, D-form, D +// ASIMD load, 1 element, all lanes, Q-form +def : InstRW<[VulcanWrite_5Cyc_LS01_F01], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, multiple, D-form, B/H/S +// ASIMD load, 2 element, multiple, Q-form, D +def : InstRW<[VulcanWrite_5Cyc_LS01_F01], + (instregex 
"^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], + (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, one lane, B/H +// ASIMD load, 2 element, one lane, S +// ASIMD load, 2 element, one lane, D +def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD2i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], + (instregex "^LD2i(8|16|32|64)_POST$")>; + +// ASIMD load, 2 element, all lanes, D-form, B/H/S +// ASIMD load, 2 element, all lanes, D-form, D +// ASIMD load, 2 element, all lanes, Q-form +def : InstRW<[VulcanWrite_5Cyc_LS01_F01], + (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], + (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, multiple, D-form, B/H/S +// ASIMD load, 3 element, multiple, Q-form, B/H/S +// ASIMD load, 3 element, multiple, Q-form, D +def : InstRW<[VulcanWrite_8Cyc_LS01_F01], + (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr], + (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, one lone, B/H +// ASIMD load, 3 element, one lane, S +// ASIMD load, 3 element, one lane, D +def : InstRW<[VulcanWrite_7Cyc_LS01_F01], (instregex "^LD3i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr], + (instregex "^LD3i(8|16|32|64)_POST$")>; + +// ASIMD load, 3 element, all lanes, D-form, B/H/S +// ASIMD load, 3 element, all lanes, D-form, D +// ASIMD load, 3 element, all lanes, Q-form, B/H/S +// ASIMD load, 3 element, all lanes, Q-form, D +def : InstRW<[VulcanWrite_7Cyc_LS01_F01], + (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr], + (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, multiple, D-form, B/H/S +// ASIMD load, 4 element, multiple, Q-form, B/H/S +// ASIMD load, 4 element, multiple, Q-form, D +def : InstRW<[VulcanWrite_8Cyc_LS01_F01], + (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr], + (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, one lane, B/H +// ASIMD load, 4 element, one lane, S +// ASIMD load, 4 element, one lane, D +def : InstRW<[VulcanWrite_6Cyc_LS01_F01], (instregex "^LD4i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr], + (instregex "^LD4i(8|16|32|64)_POST$")>; + +// ASIMD load, 4 element, all lanes, D-form, B/H/S +// ASIMD load, 4 element, all lanes, D-form, D +// ASIMD load, 4 element, all lanes, Q-form, B/H/S +// ASIMD load, 4 element, all lanes, Q-form, D +def : InstRW<[VulcanWrite_6Cyc_LS01_F01], + (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr], + (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +//-- +// 3.16 ASIMD Store Instructions +//-- + +// ASIMD store, 1 element, multiple, 1 reg, D-form +// ASIMD store, 1 element, multiple, 1 reg, Q-form +def : InstRW<[VulcanWrite_1Cyc_LS01], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 2 reg, D-form +// ASIMD store, 1 element, multiple, 2 reg, Q-form +def : InstRW<[VulcanWrite_1Cyc_LS01], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], + (instregex 
"^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 3 reg, D-form +// ASIMD store, 1 element, multiple, 3 reg, Q-form +def : InstRW<[VulcanWrite_1Cyc_LS01], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 4 reg, D-form +// ASIMD store, 1 element, multiple, 4 reg, Q-form +def : InstRW<[VulcanWrite_1Cyc_LS01], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, one lane, B/H/S +// ASIMD store, 1 element, one lane, D +def : InstRW<[VulcanWrite_1Cyc_LS01_F01], + (instregex "^ST1i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST1i(8|16|32|64)_POST$")>; + +// ASIMD store, 2 element, multiple, D-form, B/H/S +// ASIMD store, 2 element, multiple, Q-form, B/H/S +// ASIMD store, 2 element, multiple, Q-form, D +def : InstRW<[VulcanWrite_1Cyc_LS01_F01], + (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 2 element, one lane, B/H/S +// ASIMD store, 2 element, one lane, D +def : InstRW<[VulcanWrite_1Cyc_LS01_F01], + (instregex "^ST2i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST2i(8|16|32|64)_POST$")>; + +// ASIMD store, 3 element, multiple, D-form, B/H/S +// ASIMD store, 3 element, multiple, Q-form, B/H/S +// ASIMD store, 3 element, multiple, Q-form, D +def : InstRW<[VulcanWrite_1Cyc_LS01_F01], + (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 3 element, one lane, B/H +// ASIMD store, 3 element, one lane, S +// ASIMD store, 3 element, one lane, D +def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST3i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST3i(8|16|32|64)_POST$")>; + +// ASIMD store, 4 element, multiple, D-form, B/H/S +// ASIMD store, 4 element, multiple, Q-form, B/H/S +// ASIMD store, 4 element, multiple, Q-form, D +def : InstRW<[VulcanWrite_1Cyc_LS01_F01], + (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 4 element, one lane, B/H +// ASIMD store, 4 element, one lane, S +// ASIMD store, 4 element, one lane, D +def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST4i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST4i(8|16|32|64)_POST$")>; + +//-- +// 3.17 Cryptography Extensions +//-- + +// Crypto AES ops +def : InstRW<[VulcanWrite_5Cyc_F1], (instregex "^AES")>; + +// Crypto polynomial (64x64) multiply long +def : InstRW<[VulcanWrite_5Cyc_F1], (instrs PMULLv1i64, PMULLv2i64)>; + +// Crypto SHA1 xor ops +// Crypto SHA1 schedule acceleration ops +// Crypto SHA256 schedule acceleration op (1 u-op) +// Crypto SHA256 schedule acceleration op (2 u-ops) +// Crypto SHA256 hash acceleration ops +def : InstRW<[VulcanWrite_7Cyc_F1], (instregex "^SHA")>; + +//-- +// 3.18 CRC +//-- + +// CRC checksum ops +def : InstRW<[VulcanWrite_4Cyc_I1], (instregex "^CRC32")>; + +} // SchedModel = VulcanModel diff --git 
a/lib/Target/AArch64/AArch64Schedule.td b/lib/Target/AArch64/AArch64Schedule.td index eaa9110ab1bc6..ce81f48acf712 100644 --- a/lib/Target/AArch64/AArch64Schedule.td +++ b/lib/Target/AArch64/AArch64Schedule.td @@ -51,15 +51,15 @@ def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled). def ReadAdrBase : SchedRead; // Read the base resister of a reg-offset LD/ST. // Predicate for determining when a shiftable register is shifted. -def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(MI)}]>; +def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(*MI)}]>; // Predicate for determining when a extendedable register is extended. -def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(MI)}]>; +def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(*MI)}]>; // ScaledIdxPred is true if a WriteLDIdx operand will be // scaled. Subtargets can use this to dynamically select resources and // latency for WriteLDIdx and ReadAdrBase. -def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(MI)}]>; +def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(*MI)}]>; // Serialized two-level address load. // EXAMPLE: LOADGot @@ -92,6 +92,8 @@ def WriteV : SchedWrite; // Vector ops. def WriteVLD : SchedWrite; // Vector loads. def WriteVST : SchedWrite; // Vector stores. +def WriteAtomic : SchedWrite; // Atomic memory operations (CAS, Swap, LDOP) + // Read the unwritten lanes of the VLD's destination registers. def ReadVLD : SchedRead; diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index f40293021d74e..66a8f332513a7 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -17,7 +17,7 @@ using namespace llvm; #define DEBUG_TYPE "aarch64-selectiondag-info" SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( - SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, + SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVolatile, MachinePointerInfo DstPtrInfo) const { // Check to see if there is a specialized entry-point for memory zeroing. @@ -44,10 +44,16 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(Chain) .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args), 0) + DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args)) .setDiscardResult(); std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); return CallResult.second; } return SDValue(); } +bool AArch64SelectionDAGInfo::generateFMAsInMachineCombiner( + CodeGenOpt::Level OptLevel) const { + if (OptLevel >= CodeGenOpt::Aggressive) + return true; + return false; +} diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h index 97421b45b122e..7e4f11091226d 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.h +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -7,24 +7,24 @@ // //===----------------------------------------------------------------------===// // -// This file defines the AArch64 subclass for TargetSelectionDAGInfo. +// This file defines the AArch64 subclass for SelectionDAGTargetInfo. 
// //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H #define LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H -#include "llvm/Target/TargetSelectionDAGInfo.h" +#include "llvm/CodeGen/SelectionDAGTargetInfo.h" namespace llvm { -class AArch64SelectionDAGInfo : public TargetSelectionDAGInfo { +class AArch64SelectionDAGInfo : public SelectionDAGTargetInfo { public: - - SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain, - SDValue Dst, SDValue Src, SDValue Size, - unsigned Align, bool isVolatile, + SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl, + SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, bool isVolatile, MachinePointerInfo DstPtrInfo) const override; + bool generateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const override; }; } diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp index 1c6b15790ea98..f904b23794169 100644 --- a/lib/Target/AArch64/AArch64StorePairSuppress.cpp +++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp @@ -115,6 +115,9 @@ bool AArch64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) { } bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) + return false; + const TargetSubtargetInfo &ST = MF.getSubtarget(); TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo()); TRI = ST.getRegisterInfo(); @@ -141,8 +144,8 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) { if (!isNarrowFPStore(MI)) continue; unsigned BaseReg; - unsigned Offset; - if (TII->getMemOpBaseRegImmOfs(&MI, BaseReg, Offset, TRI)) { + int64_t Offset; + if (TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI)) { if (PrevBaseReg == BaseReg) { // If this block can take STPs, skip ahead to the next block. if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent())) @@ -150,7 +153,7 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) { // Otherwise, continue unpairing the stores in this block. DEBUG(dbgs() << "Unpairing store " << MI << "\n"); SuppressSTP = true; - TII->suppressLdStPair(&MI); + TII->suppressLdStPair(MI); } PrevBaseReg = BaseReg; } else diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index f6ee8cf47a6a4..7dd8ccbe6c25e 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -11,10 +11,9 @@ // //===----------------------------------------------------------------------===// +#include "AArch64Subtarget.h" #include "AArch64InstrInfo.h" #include "AArch64PBQPRegAlloc.h" -#include "AArch64Subtarget.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/IR/GlobalValue.h" #include "llvm/Support/TargetRegistry.h" @@ -44,58 +43,83 @@ AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) { CPUString = "generic"; ParseSubtargetFeatures(CPUString, FS); + initializeProperties(); + return *this; } +void AArch64Subtarget::initializeProperties() { + // Initialize CPU specific properties. We should add a tablegen feature for + // this in the future so we can specify it together with the subtarget + // features. 
+ switch (ARMProcFamily) { + case Cyclone: + CacheLineSize = 64; + PrefetchDistance = 280; + MinPrefetchStride = 2048; + MaxPrefetchIterationsAhead = 3; + break; + case CortexA57: + MaxInterleaveFactor = 4; + break; + case ExynosM1: + PrefFunctionAlignment = 4; + PrefLoopAlignment = 3; + break; + case Kryo: + MaxInterleaveFactor = 4; + VectorInsertExtractBaseCost = 2; + CacheLineSize = 128; + PrefetchDistance = 740; + MinPrefetchStride = 1024; + MaxPrefetchIterationsAhead = 11; + break; + case Vulcan: + MaxInterleaveFactor = 4; + break; + case CortexA35: break; + case CortexA53: break; + case CortexA72: break; + case CortexA73: break; + case Others: break; + } +} + AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM, bool LittleEndian) - : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), - HasV8_1aOps(false), HasV8_2aOps(false), HasFPARMv8(false), HasNEON(false), - HasCrypto(false), HasCRC(false), HasPerfMon(false), HasFullFP16(false), - HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), - StrictAlign(false), ReserveX18(TT.isOSDarwin()), IsLittle(LittleEndian), - CPUString(CPU), TargetTriple(TT), FrameLowering(), + : AArch64GenSubtargetInfo(TT, CPU, FS), ReserveX18(TT.isOSDarwin()), + IsLittle(LittleEndian), CPUString(CPU), TargetTriple(TT), FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(), - TLInfo(TM, *this) {} + TLInfo(TM, *this), GISel() {} + +const CallLowering *AArch64Subtarget::getCallLowering() const { + assert(GISel && "Access to GlobalISel APIs not set"); + return GISel->getCallLowering(); +} + +const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const { + assert(GISel && "Access to GlobalISel APIs not set"); + return GISel->getRegBankInfo(); +} -/// ClassifyGlobalReference - Find the target operand flags that describe -/// how a global value should be referenced for the current subtarget. +/// Find the target operand flags that describe how a global value should be +/// referenced for the current subtarget. unsigned char AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, - const TargetMachine &TM) const { - bool isDef = GV->isStrongDefinitionForLinker(); - + const TargetMachine &TM) const { // MachO large model always goes via a GOT, simply to get a single 8-byte // absolute relocation on all global addresses. if (TM.getCodeModel() == CodeModel::Large && isTargetMachO()) return AArch64II::MO_GOT; + if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) + return AArch64II::MO_GOT; + // The small code mode's direct accesses use ADRP, which cannot necessarily // produce the value 0 (if the code is above 4GB). - if (TM.getCodeModel() == CodeModel::Small && GV->hasExternalWeakLinkage()) { - // In PIC mode use the GOT, but in absolute mode use a constant pool load. - if (TM.getRelocationModel() == Reloc::Static) - return AArch64II::MO_CONSTPOOL; - else - return AArch64II::MO_GOT; - } - - // If symbol visibility is hidden, the extra load is not needed if - // the symbol is definitely defined in the current translation unit. - - // The handling of non-hidden symbols in PIC mode is rather target-dependent: - // + On MachO, if the symbol is defined in this module the GOT can be - // skipped. - // + On ELF, the R_AARCH64_COPY relocation means that even symbols actually - // defined could end up in unexpected places. Use a GOT. - if (TM.getRelocationModel() != Reloc::Static && GV->hasDefaultVisibility()) { - if (isTargetMachO()) - return isDef ? 
AArch64II::MO_NO_FLAG : AArch64II::MO_GOT; - else - // No need to go through the GOT for local symbols on ELF. - return GV->hasLocalLinkage() ? AArch64II::MO_NO_FLAG : AArch64II::MO_GOT; - } + if (TM.getCodeModel() == CodeModel::Small && GV->hasExternalWeakLinkage()) + return AArch64II::MO_GOT; return AArch64II::MO_NO_FLAG; } @@ -114,8 +138,7 @@ const char *AArch64Subtarget::getBZeroEntry() const { } void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, - MachineInstr *begin, MachineInstr *end, - unsigned NumRegionInstrs) const { + unsigned NumRegionInstrs) const { // LNT run (at least on Cyclone) showed reasonably significant gains for // bi-directional scheduling. 253.perlbmk. Policy.OnlyTopDown = false; @@ -123,8 +146,7 @@ void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, // Enabling or Disabling the latency heuristic is a close call: It seems to // help nearly no benchmark on out-of-order architectures, on the other hand // it regresses register pressure on a few benchmarking. - if (isCyclone()) - Policy.DisableLatencyHeuristic = true; + Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic; } bool AArch64Subtarget::enableEarlyIfConversion() const { @@ -146,8 +168,5 @@ bool AArch64Subtarget::supportsAddressTopByteIgnored() const { std::unique_ptr<PBQPRAConstraint> AArch64Subtarget::getCustomPBQPConstraints() const { - if (!isCortexA57()) - return nullptr; - - return llvm::make_unique<A57ChainingConstraint>(); + return balanceFPOps() ? llvm::make_unique<A57ChainingConstraint>() : nullptr; } diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 151133b2f32c9..16a35405c8924 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -19,6 +19,7 @@ #include "AArch64InstrInfo.h" #include "AArch64RegisterInfo.h" #include "AArch64SelectionDAGInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" #include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -32,38 +33,64 @@ class StringRef; class Triple; class AArch64Subtarget : public AArch64GenSubtargetInfo { -protected: - enum ARMProcFamilyEnum { +public: + enum ARMProcFamilyEnum : uint8_t { Others, CortexA35, CortexA53, CortexA57, + CortexA72, + CortexA73, Cyclone, - ExynosM1 + ExynosM1, + Kryo, + Vulcan }; +protected: /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others. - ARMProcFamilyEnum ARMProcFamily; + ARMProcFamilyEnum ARMProcFamily = Others; - bool HasV8_1aOps; - bool HasV8_2aOps; + bool HasV8_1aOps = false; + bool HasV8_2aOps = false; - bool HasFPARMv8; - bool HasNEON; - bool HasCrypto; - bool HasCRC; - bool HasPerfMon; - bool HasFullFP16; - bool HasSPE; + bool HasFPARMv8 = false; + bool HasNEON = false; + bool HasCrypto = false; + bool HasCRC = false; + bool HasRAS = false; + bool HasPerfMon = false; + bool HasFullFP16 = false; + bool HasSPE = false; // HasZeroCycleRegMove - Has zero-cycle register mov instructions. - bool HasZeroCycleRegMove; + bool HasZeroCycleRegMove = false; // HasZeroCycleZeroing - Has zero-cycle zeroing instructions. - bool HasZeroCycleZeroing; + bool HasZeroCycleZeroing = false; // StrictAlign - Disallow unaligned memory accesses. 
- bool StrictAlign; + bool StrictAlign = false; + bool MergeNarrowLoads = false; + bool UseAA = false; + bool PredictableSelectIsExpensive = false; + bool BalanceFPOps = false; + bool CustomAsCheapAsMove = false; + bool UsePostRAScheduler = false; + bool Misaligned128StoreIsSlow = false; + bool AvoidQuadLdStPairs = false; + bool UseAlternateSExtLoadCVTF32Pattern = false; + bool HasMacroOpFusion = false; + bool DisableLatencySchedHeuristic = false; + bool UseRSqrt = false; + uint8_t MaxInterleaveFactor = 2; + uint8_t VectorInsertExtractBaseCost = 3; + uint16_t CacheLineSize = 0; + uint16_t PrefetchDistance = 0; + uint16_t MinPrefetchStride = 1; + unsigned MaxPrefetchIterationsAhead = UINT_MAX; + unsigned PrefFunctionAlignment = 0; + unsigned PrefLoopAlignment = 0; // ReserveX18 - X18 is not available as a general purpose register. bool ReserveX18; @@ -80,12 +107,20 @@ protected: AArch64InstrInfo InstrInfo; AArch64SelectionDAGInfo TSInfo; AArch64TargetLowering TLInfo; + /// Gather the accessor points to GlobalISel-related APIs. + /// This is used to avoid ifndefs spreading around while GISel is + /// an optional library. + std::unique_ptr<GISelAccessor> GISel; + private: /// initializeSubtargetDependencies - Initializes using CPUString and the /// passed in feature string so that we can use initializer lists for /// subtarget initialization. AArch64Subtarget &initializeSubtargetDependencies(StringRef FS); + /// Initialize properties based on the selected processor family. + void initializeProperties(); + public: /// This constructor initializes the data members to match that /// of the specified triple. @@ -93,6 +128,11 @@ public: const std::string &FS, const TargetMachine &TM, bool LittleEndian); + /// This object will take onwership of \p GISelAccessor. + void setGISelAccessor(GISelAccessor &GISel) { + this->GISel.reset(&GISel); + } + const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override { return &TSInfo; } @@ -106,10 +146,20 @@ public: const AArch64RegisterInfo *getRegisterInfo() const override { return &getInstrInfo()->getRegisterInfo(); } + const CallLowering *getCallLowering() const override; + const RegisterBankInfo *getRegBankInfo() const override; const Triple &getTargetTriple() const { return TargetTriple; } bool enableMachineScheduler() const override { return true; } bool enablePostRAScheduler() const override { - return isGeneric() || isCortexA53() || isCortexA57(); + return UsePostRAScheduler; + } + + /// Returns ARM processor family. + /// Avoid this function! CPU specifics should be kept local to this class + /// and preferably modeled with SubtargetFeatures or properties in + /// initializeProperties(). 
+ ARMProcFamilyEnum getProcFamily() const { + return ARMProcFamily; } bool hasV8_1aOps() const { return HasV8_1aOps; } @@ -126,6 +176,33 @@ public: bool hasNEON() const { return HasNEON; } bool hasCrypto() const { return HasCrypto; } bool hasCRC() const { return HasCRC; } + bool hasRAS() const { return HasRAS; } + bool mergeNarrowLoads() const { return MergeNarrowLoads; } + bool balanceFPOps() const { return BalanceFPOps; } + bool predictableSelectIsExpensive() const { + return PredictableSelectIsExpensive; + } + bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; } + bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; } + bool avoidQuadLdStPairs() const { return AvoidQuadLdStPairs; } + bool useAlternateSExtLoadCVTF32Pattern() const { + return UseAlternateSExtLoadCVTF32Pattern; + } + bool hasMacroOpFusion() const { return HasMacroOpFusion; } + bool useRSqrt() const { return UseRSqrt; } + unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; } + unsigned getVectorInsertExtractBaseCost() const { + return VectorInsertExtractBaseCost; + } + unsigned getCacheLineSize() const { return CacheLineSize; } + unsigned getPrefetchDistance() const { return PrefetchDistance; } + unsigned getMinPrefetchStride() const { return MinPrefetchStride; } + unsigned getMaxPrefetchIterationsAhead() const { + return MaxPrefetchIterationsAhead; + } + unsigned getPrefFunctionAlignment() const { return PrefFunctionAlignment; } + unsigned getPrefLoopAlignment() const { return PrefLoopAlignment; } + /// CPU has TBI (top byte of addresses is ignored during HW address /// translation) and OS enables it. bool supportsAddressTopByteIgnored() const; @@ -146,13 +223,7 @@ public: bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } - bool isGeneric() const { return CPUString == "generic"; } - bool isCyclone() const { return CPUString == "cyclone"; } - bool isCortexA57() const { return CPUString == "cortex-a57"; } - bool isCortexA53() const { return CPUString == "cortex-a53"; } - bool isExynosM1() const { return CPUString == "exynos-m1"; } - - bool useAA() const override { return isCortexA53(); } + bool useAA() const override { return UseAA; } /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size /// that still makes it profitable to inline the call. @@ -174,8 +245,7 @@ public: /// returns null. const char *getBZeroEntry() const; - void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin, - MachineInstr *end, + void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override; bool enableEarlyIfConversion() const override; diff --git a/lib/Target/AArch64/AArch64SystemOperands.td b/lib/Target/AArch64/AArch64SystemOperands.td new file mode 100644 index 0000000000000..a3736c0868fb7 --- /dev/null +++ b/lib/Target/AArch64/AArch64SystemOperands.td @@ -0,0 +1,1018 @@ +//===- AArch64SystemOperands.td ----------------------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the symbolic operands permitted for various kinds of +// AArch64 system instruction. 
+// +//===----------------------------------------------------------------------===// + +include "llvm/TableGen/SearchableTable.td" + +//===----------------------------------------------------------------------===// +// AT (address translate) instruction options. +//===----------------------------------------------------------------------===// + +class AT<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, + bits<3> op2> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<16> Encoding; + let Encoding{15-14} = op0; + let Encoding{13-11} = op1; + let Encoding{10-7} = crn; + let Encoding{6-3} = crm; + let Encoding{2-0} = op2; +} + +def : AT<"S1E1R", 0b01, 0b000, 0b0111, 0b1000, 0b000>; +def : AT<"S1E2R", 0b01, 0b100, 0b0111, 0b1000, 0b000>; +def : AT<"S1E3R", 0b01, 0b110, 0b0111, 0b1000, 0b000>; +def : AT<"S1E1W", 0b01, 0b000, 0b0111, 0b1000, 0b001>; +def : AT<"S1E2W", 0b01, 0b100, 0b0111, 0b1000, 0b001>; +def : AT<"S1E3W", 0b01, 0b110, 0b0111, 0b1000, 0b001>; +def : AT<"S1E0R", 0b01, 0b000, 0b0111, 0b1000, 0b010>; +def : AT<"S1E0W", 0b01, 0b000, 0b0111, 0b1000, 0b011>; +def : AT<"S12E1R", 0b01, 0b100, 0b0111, 0b1000, 0b100>; +def : AT<"S12E1W", 0b01, 0b100, 0b0111, 0b1000, 0b101>; +def : AT<"S12E0R", 0b01, 0b100, 0b0111, 0b1000, 0b110>; +def : AT<"S12E0W", 0b01, 0b100, 0b0111, 0b1000, 0b111>; +def : AT<"S1E1RP", 0b01, 0b000, 0b0111, 0b1001, 0b000>; +def : AT<"S1E1WP", 0b01, 0b000, 0b0111, 0b1001, 0b001>; + + +//===----------------------------------------------------------------------===// +// DMB/DSB (data barrier) instruction options. +//===----------------------------------------------------------------------===// + +class DB<string name, bits<4> encoding> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<4> Encoding = encoding; +} + +def : DB<"oshld", 0x1>; +def : DB<"oshst", 0x2>; +def : DB<"osh", 0x3>; +def : DB<"nshld", 0x5>; +def : DB<"nshst", 0x6>; +def : DB<"nsh", 0x7>; +def : DB<"ishld", 0x9>; +def : DB<"ishst", 0xa>; +def : DB<"ish", 0xb>; +def : DB<"ld", 0xd>; +def : DB<"st", 0xe>; +def : DB<"sy", 0xf>; + +//===----------------------------------------------------------------------===// +// DC (data cache maintenance) instruction options. +//===----------------------------------------------------------------------===// + +class DC<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, + bits<3> op2> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<16> Encoding; + let Encoding{15-14} = op0; + let Encoding{13-11} = op1; + let Encoding{10-7} = crn; + let Encoding{6-3} = crm; + let Encoding{2-0} = op2; +} + +def : DC<"ZVA", 0b01, 0b011, 0b0111, 0b0100, 0b001>; +def : DC<"IVAC", 0b01, 0b000, 0b0111, 0b0110, 0b001>; +def : DC<"ISW", 0b01, 0b000, 0b0111, 0b0110, 0b010>; +def : DC<"CVAC", 0b01, 0b011, 0b0111, 0b1010, 0b001>; +def : DC<"CSW", 0b01, 0b000, 0b0111, 0b1010, 0b010>; +def : DC<"CVAU", 0b01, 0b011, 0b0111, 0b1011, 0b001>; +def : DC<"CIVAC", 0b01, 0b011, 0b0111, 0b1110, 0b001>; +def : DC<"CISW", 0b01, 0b000, 0b0111, 0b1110, 0b010>; + +//===----------------------------------------------------------------------===// +// IC (instruction cache maintenance) instruction options. 
+//===----------------------------------------------------------------------===// + +class IC<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2, + bit needsreg> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<14> Encoding; + let Encoding{13-11} = op1; + let Encoding{10-7} = crn; + let Encoding{6-3} = crm; + let Encoding{2-0} = op2; + bit NeedsReg = needsreg; +} + +def : IC<"IALLUIS", 0b000, 0b0111, 0b0001, 0b000, 0>; +def : IC<"IALLU", 0b000, 0b0111, 0b0101, 0b000, 0>; +def : IC<"IVAU", 0b000, 0b0111, 0b0001, 0b000, 1>; + +//===----------------------------------------------------------------------===// +// ISB (instruction-fetch barrier) instruction options. +//===----------------------------------------------------------------------===// + +class ISB<string name, bits<4> encoding> : SearchableTable{ + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<4> Encoding; + let Encoding = encoding; +} + +def : ISB<"sy", 0xf>; + +//===----------------------------------------------------------------------===// +// PRFM (prefetch) instruction options. +//===----------------------------------------------------------------------===// + +class PRFM<string name, bits<5> encoding> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<5> Encoding; + let Encoding = encoding; +} + +def : PRFM<"pldl1keep", 0x00>; +def : PRFM<"pldl1strm", 0x01>; +def : PRFM<"pldl2keep", 0x02>; +def : PRFM<"pldl2strm", 0x03>; +def : PRFM<"pldl3keep", 0x04>; +def : PRFM<"pldl3strm", 0x05>; +def : PRFM<"plil1keep", 0x08>; +def : PRFM<"plil1strm", 0x09>; +def : PRFM<"plil2keep", 0x0a>; +def : PRFM<"plil2strm", 0x0b>; +def : PRFM<"plil3keep", 0x0c>; +def : PRFM<"plil3strm", 0x0d>; +def : PRFM<"pstl1keep", 0x10>; +def : PRFM<"pstl1strm", 0x11>; +def : PRFM<"pstl2keep", 0x12>; +def : PRFM<"pstl2strm", 0x13>; +def : PRFM<"pstl3keep", 0x14>; +def : PRFM<"pstl3strm", 0x15>; + +//===----------------------------------------------------------------------===// +// PState instruction options. +//===----------------------------------------------------------------------===// + +class PState<string name, bits<5> encoding> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<5> Encoding; + let Encoding = encoding; + code Requires = [{ {} }]; +} + +def : PState<"SPSel", 0b00101>; +def : PState<"DAIFSet", 0b11110>; +def : PState<"DAIFClr", 0b11111>; +// v8.1a "Privileged Access Never" extension-specific PStates +let Requires = [{ {AArch64::HasV8_1aOps} }] in +def : PState<"PAN", 0b00100>; +// v8.2a "User Access Override" extension-specific PStates +let Requires = [{ {AArch64::HasV8_2aOps} }] in +def : PState<"UAO", 0b00011>; + + +//===----------------------------------------------------------------------===// +// PSB instruction options. +//===----------------------------------------------------------------------===// + +class PSB<string name, bits<5> encoding> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<5> Encoding; + let Encoding = encoding; +} + +def : PSB<"csync", 0x11>; + +//===----------------------------------------------------------------------===// +// TLBI (translation lookaside buffer invalidate) instruction options. 
+//===----------------------------------------------------------------------===// + +class TLBI<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, + bits<3> op2, bit needsreg = 1> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<16> Encoding; + let Encoding{15-14} = op0; + let Encoding{13-11} = op1; + let Encoding{10-7} = crn; + let Encoding{6-3} = crm; + let Encoding{2-0} = op2; + bit NeedsReg = needsreg; +} + +def : TLBI<"IPAS2E1IS", 0b01, 0b100, 0b1000, 0b0000, 0b001>; +def : TLBI<"IPAS2LE1IS", 0b01, 0b100, 0b1000, 0b0000, 0b101>; +def : TLBI<"VMALLE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b000, 0>; +def : TLBI<"ALLE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b000, 0>; +def : TLBI<"ALLE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b000, 0>; +def : TLBI<"VAE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b001>; +def : TLBI<"VAE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b001>; +def : TLBI<"VAE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b001>; +def : TLBI<"ASIDE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b010>; +def : TLBI<"VAAE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b011>; +def : TLBI<"ALLE1IS", 0b01, 0b100, 0b1000, 0b0011, 0b100, 0>; +def : TLBI<"VALE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b101>; +def : TLBI<"VALE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b101>; +def : TLBI<"VALE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b101>; +def : TLBI<"VMALLS12E1IS", 0b01, 0b100, 0b1000, 0b0011, 0b110, 0>; +def : TLBI<"VAALE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b111>; +def : TLBI<"IPAS2E1", 0b01, 0b100, 0b1000, 0b0100, 0b001>; +def : TLBI<"IPAS2LE1", 0b01, 0b100, 0b1000, 0b0100, 0b101>; +def : TLBI<"VMALLE1", 0b01, 0b000, 0b1000, 0b0111, 0b000, 0>; +def : TLBI<"ALLE2", 0b01, 0b100, 0b1000, 0b0111, 0b000, 0>; +def : TLBI<"ALLE3", 0b01, 0b110, 0b1000, 0b0111, 0b000, 0>; +def : TLBI<"VAE1", 0b01, 0b000, 0b1000, 0b0111, 0b001>; +def : TLBI<"VAE2", 0b01, 0b100, 0b1000, 0b0111, 0b001>; +def : TLBI<"VAE3", 0b01, 0b110, 0b1000, 0b0111, 0b001>; +def : TLBI<"ASIDE1", 0b01, 0b000, 0b1000, 0b0111, 0b010>; +def : TLBI<"VAAE1", 0b01, 0b000, 0b1000, 0b0111, 0b011>; +def : TLBI<"ALLE1", 0b01, 0b100, 0b1000, 0b0111, 0b100, 0>; +def : TLBI<"VALE1", 0b01, 0b000, 0b1000, 0b0111, 0b101>; +def : TLBI<"VALE2", 0b01, 0b100, 0b1000, 0b0111, 0b101>; +def : TLBI<"VALE3", 0b01, 0b110, 0b1000, 0b0111, 0b101>; +def : TLBI<"VMALLS12E1", 0b01, 0b100, 0b1000, 0b0111, 0b110, 0>; +def : TLBI<"VAALE1", 0b01, 0b000, 0b1000, 0b0111, 0b111>; + + +//===----------------------------------------------------------------------===// +// MRS/MSR (system register read/write) instruction options. 
+//===----------------------------------------------------------------------===// + +class SysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, + bits<3> op2> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<16> Encoding; + let Encoding{15-14} = op0; + let Encoding{13-11} = op1; + let Encoding{10-7} = crn; + let Encoding{6-3} = crm; + let Encoding{2-0} = op2; + bit Readable = ?; + bit Writeable = ?; + code Requires = [{ {} }]; +} + +class RWSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, + bits<3> op2> + : SysReg<name, op0, op1, crn, crm, op2> { + let Readable = 1; + let Writeable = 1; +} + +class ROSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, + bits<3> op2> + : SysReg<name, op0, op1, crn, crm, op2> { + let Readable = 1; + let Writeable = 0; +} + +class WOSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, + bits<3> op2> + : SysReg<name, op0, op1, crn, crm, op2> { + let Readable = 0; + let Writeable = 1; +} + +//===---------------------- +// Read-only regs +//===---------------------- + +// Op0 Op1 CRn CRm Op2 +def : ROSysReg<"MDCCSR_EL0", 0b10, 0b011, 0b0000, 0b0001, 0b000>; +def : ROSysReg<"DBGDTRRX_EL0", 0b10, 0b011, 0b0000, 0b0101, 0b000>; +def : ROSysReg<"MDRAR_EL1", 0b10, 0b000, 0b0001, 0b0000, 0b000>; +def : ROSysReg<"OSLSR_EL1", 0b10, 0b000, 0b0001, 0b0001, 0b100>; +def : ROSysReg<"DBGAUTHSTATUS_EL1", 0b10, 0b000, 0b0111, 0b1110, 0b110>; +def : ROSysReg<"PMCEID0_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b110>; +def : ROSysReg<"PMCEID1_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b111>; +def : ROSysReg<"MIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b000>; +def : ROSysReg<"CCSIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b000>; +def : ROSysReg<"CLIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b001>; +def : ROSysReg<"CTR_EL0", 0b11, 0b011, 0b0000, 0b0000, 0b001>; +def : ROSysReg<"MPIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b101>; +def : ROSysReg<"REVIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b110>; +def : ROSysReg<"AIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b111>; +def : ROSysReg<"DCZID_EL0", 0b11, 0b011, 0b0000, 0b0000, 0b111>; +def : ROSysReg<"ID_PFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b000>; +def : ROSysReg<"ID_PFR1_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b001>; +def : ROSysReg<"ID_DFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b010>; +def : ROSysReg<"ID_AFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b011>; +def : ROSysReg<"ID_MMFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b100>; +def : ROSysReg<"ID_MMFR1_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b101>; +def : ROSysReg<"ID_MMFR2_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b110>; +def : ROSysReg<"ID_MMFR3_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b111>; +def : ROSysReg<"ID_ISAR0_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b000>; +def : ROSysReg<"ID_ISAR1_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b001>; +def : ROSysReg<"ID_ISAR2_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b010>; +def : ROSysReg<"ID_ISAR3_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b011>; +def : ROSysReg<"ID_ISAR4_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b100>; +def : ROSysReg<"ID_ISAR5_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b101>; +def : ROSysReg<"ID_AA64PFR0_EL1", 0b11, 0b000, 0b0000, 0b0100, 0b000>; +def : ROSysReg<"ID_AA64PFR1_EL1", 0b11, 0b000, 0b0000, 0b0100, 0b001>; +def : ROSysReg<"ID_AA64DFR0_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b000>; +def : ROSysReg<"ID_AA64DFR1_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b001>; +def : ROSysReg<"ID_AA64AFR0_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b100>; +def : 
ROSysReg<"ID_AA64AFR1_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b101>; +def : ROSysReg<"ID_AA64ISAR0_EL1", 0b11, 0b000, 0b0000, 0b0110, 0b000>; +def : ROSysReg<"ID_AA64ISAR1_EL1", 0b11, 0b000, 0b0000, 0b0110, 0b001>; +def : ROSysReg<"ID_AA64MMFR0_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b000>; +def : ROSysReg<"ID_AA64MMFR1_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b001>; +def : ROSysReg<"ID_AA64MMFR2_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b010> { + let Requires = [{ {AArch64::HasV8_2aOps} }]; +} +def : ROSysReg<"MVFR0_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b000>; +def : ROSysReg<"MVFR1_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b001>; +def : ROSysReg<"MVFR2_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b010>; +def : ROSysReg<"RVBAR_EL1", 0b11, 0b000, 0b1100, 0b0000, 0b001>; +def : ROSysReg<"RVBAR_EL2", 0b11, 0b100, 0b1100, 0b0000, 0b001>; +def : ROSysReg<"RVBAR_EL3", 0b11, 0b110, 0b1100, 0b0000, 0b001>; +def : ROSysReg<"ISR_EL1", 0b11, 0b000, 0b1100, 0b0001, 0b000>; +def : ROSysReg<"CNTPCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b001>; +def : ROSysReg<"CNTVCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b010>; +def : ROSysReg<"ID_MMFR4_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b110>; + +// Trace registers +// Op0 Op1 CRn CRm Op2 +def : ROSysReg<"TRCSTATR", 0b10, 0b001, 0b0000, 0b0011, 0b000>; +def : ROSysReg<"TRCIDR8", 0b10, 0b001, 0b0000, 0b0000, 0b110>; +def : ROSysReg<"TRCIDR9", 0b10, 0b001, 0b0000, 0b0001, 0b110>; +def : ROSysReg<"TRCIDR10", 0b10, 0b001, 0b0000, 0b0010, 0b110>; +def : ROSysReg<"TRCIDR11", 0b10, 0b001, 0b0000, 0b0011, 0b110>; +def : ROSysReg<"TRCIDR12", 0b10, 0b001, 0b0000, 0b0100, 0b110>; +def : ROSysReg<"TRCIDR13", 0b10, 0b001, 0b0000, 0b0101, 0b110>; +def : ROSysReg<"TRCIDR0", 0b10, 0b001, 0b0000, 0b1000, 0b111>; +def : ROSysReg<"TRCIDR1", 0b10, 0b001, 0b0000, 0b1001, 0b111>; +def : ROSysReg<"TRCIDR2", 0b10, 0b001, 0b0000, 0b1010, 0b111>; +def : ROSysReg<"TRCIDR3", 0b10, 0b001, 0b0000, 0b1011, 0b111>; +def : ROSysReg<"TRCIDR4", 0b10, 0b001, 0b0000, 0b1100, 0b111>; +def : ROSysReg<"TRCIDR5", 0b10, 0b001, 0b0000, 0b1101, 0b111>; +def : ROSysReg<"TRCIDR6", 0b10, 0b001, 0b0000, 0b1110, 0b111>; +def : ROSysReg<"TRCIDR7", 0b10, 0b001, 0b0000, 0b1111, 0b111>; +def : ROSysReg<"TRCOSLSR", 0b10, 0b001, 0b0001, 0b0001, 0b100>; +def : ROSysReg<"TRCPDSR", 0b10, 0b001, 0b0001, 0b0101, 0b100>; +def : ROSysReg<"TRCDEVAFF0", 0b10, 0b001, 0b0111, 0b1010, 0b110>; +def : ROSysReg<"TRCDEVAFF1", 0b10, 0b001, 0b0111, 0b1011, 0b110>; +def : ROSysReg<"TRCLSR", 0b10, 0b001, 0b0111, 0b1101, 0b110>; +def : ROSysReg<"TRCAUTHSTATUS", 0b10, 0b001, 0b0111, 0b1110, 0b110>; +def : ROSysReg<"TRCDEVARCH", 0b10, 0b001, 0b0111, 0b1111, 0b110>; +def : ROSysReg<"TRCDEVID", 0b10, 0b001, 0b0111, 0b0010, 0b111>; +def : ROSysReg<"TRCDEVTYPE", 0b10, 0b001, 0b0111, 0b0011, 0b111>; +def : ROSysReg<"TRCPIDR4", 0b10, 0b001, 0b0111, 0b0100, 0b111>; +def : ROSysReg<"TRCPIDR5", 0b10, 0b001, 0b0111, 0b0101, 0b111>; +def : ROSysReg<"TRCPIDR6", 0b10, 0b001, 0b0111, 0b0110, 0b111>; +def : ROSysReg<"TRCPIDR7", 0b10, 0b001, 0b0111, 0b0111, 0b111>; +def : ROSysReg<"TRCPIDR0", 0b10, 0b001, 0b0111, 0b1000, 0b111>; +def : ROSysReg<"TRCPIDR1", 0b10, 0b001, 0b0111, 0b1001, 0b111>; +def : ROSysReg<"TRCPIDR2", 0b10, 0b001, 0b0111, 0b1010, 0b111>; +def : ROSysReg<"TRCPIDR3", 0b10, 0b001, 0b0111, 0b1011, 0b111>; +def : ROSysReg<"TRCCIDR0", 0b10, 0b001, 0b0111, 0b1100, 0b111>; +def : ROSysReg<"TRCCIDR1", 0b10, 0b001, 0b0111, 0b1101, 0b111>; +def : ROSysReg<"TRCCIDR2", 0b10, 0b001, 0b0111, 0b1110, 0b111>; +def : ROSysReg<"TRCCIDR3", 0b10, 0b001, 0b0111, 0b1111, 0b111>; + +// GICv3 registers 
+// Op0 Op1 CRn CRm Op2 +def : ROSysReg<"ICC_IAR1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b000>; +def : ROSysReg<"ICC_IAR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b000>; +def : ROSysReg<"ICC_HPPIR1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b010>; +def : ROSysReg<"ICC_HPPIR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b010>; +def : ROSysReg<"ICC_RPR_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b011>; +def : ROSysReg<"ICH_VTR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b001>; +def : ROSysReg<"ICH_EISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b011>; +def : ROSysReg<"ICH_ELSR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b101>; + +// v8.1a "Limited Ordering Regions" extension-specific system register +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::HasV8_1aOps} }] in +def : ROSysReg<"LORID_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b111>; + +// v8.2a "RAS extension" registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureRAS} }] in { +def : ROSysReg<"ERRIDR_EL1", 0b11, 0b000, 0b0101, 0b0011, 0b000>; +def : ROSysReg<"ERXFR_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b000>; +} + +//===---------------------- +// Write-only regs +//===---------------------- + +// Op0 Op1 CRn CRm Op2 +def : WOSysReg<"DBGDTRTX_EL0", 0b10, 0b011, 0b0000, 0b0101, 0b000>; +def : WOSysReg<"OSLAR_EL1", 0b10, 0b000, 0b0001, 0b0000, 0b100>; +def : WOSysReg<"PMSWINC_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b100>; + +// Trace Registers +// Op0 Op1 CRn CRm Op2 +def : WOSysReg<"TRCOSLAR", 0b10, 0b001, 0b0001, 0b0000, 0b100>; +def : WOSysReg<"TRCLAR", 0b10, 0b001, 0b0111, 0b1100, 0b110>; + +// GICv3 registers +// Op0 Op1 CRn CRm Op2 +def : WOSysReg<"ICC_EOIR1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b001>; +def : WOSysReg<"ICC_EOIR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b001>; +def : WOSysReg<"ICC_DIR_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b001>; +def : WOSysReg<"ICC_SGI1R_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b101>; +def : WOSysReg<"ICC_ASGI1R_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b110>; +def : WOSysReg<"ICC_SGI0R_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b111>; + +//===---------------------- +// Read-write regs +//===---------------------- + +// Op0 Op1 CRn CRm Op2 +def : RWSysReg<"OSDTRRX_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b010>; +def : RWSysReg<"OSDTRTX_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b010>; +def : RWSysReg<"TEECR32_EL1", 0b10, 0b010, 0b0000, 0b0000, 0b000>; +def : RWSysReg<"MDCCINT_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b000>; +def : RWSysReg<"MDSCR_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b010>; +def : RWSysReg<"DBGDTR_EL0", 0b10, 0b011, 0b0000, 0b0100, 0b000>; +def : RWSysReg<"OSECCR_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b010>; +def : RWSysReg<"DBGVCR32_EL2", 0b10, 0b100, 0b0000, 0b0111, 0b000>; +def : RWSysReg<"DBGBVR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b100>; +def : RWSysReg<"DBGBVR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b100>; +def : RWSysReg<"DBGBVR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b100>; +def : RWSysReg<"DBGBVR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b100>; +def : RWSysReg<"DBGBVR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b100>; +def : RWSysReg<"DBGBVR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b100>; +def : RWSysReg<"DBGBVR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b100>; +def : RWSysReg<"DBGBVR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b100>; +def : RWSysReg<"DBGBVR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b100>; +def : RWSysReg<"DBGBVR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b100>; +def : RWSysReg<"DBGBVR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b100>; +def : RWSysReg<"DBGBVR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b100>; +def : RWSysReg<"DBGBVR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b100>; +def : RWSysReg<"DBGBVR13_EL1", 
0b10, 0b000, 0b0000, 0b1101, 0b100>; +def : RWSysReg<"DBGBVR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b100>; +def : RWSysReg<"DBGBVR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b100>; +def : RWSysReg<"DBGBCR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b101>; +def : RWSysReg<"DBGBCR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b101>; +def : RWSysReg<"DBGBCR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b101>; +def : RWSysReg<"DBGBCR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b101>; +def : RWSysReg<"DBGBCR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b101>; +def : RWSysReg<"DBGBCR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b101>; +def : RWSysReg<"DBGBCR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b101>; +def : RWSysReg<"DBGBCR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b101>; +def : RWSysReg<"DBGBCR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b101>; +def : RWSysReg<"DBGBCR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b101>; +def : RWSysReg<"DBGBCR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b101>; +def : RWSysReg<"DBGBCR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b101>; +def : RWSysReg<"DBGBCR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b101>; +def : RWSysReg<"DBGBCR13_EL1", 0b10, 0b000, 0b0000, 0b1101, 0b101>; +def : RWSysReg<"DBGBCR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b101>; +def : RWSysReg<"DBGBCR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b101>; +def : RWSysReg<"DBGWVR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b110>; +def : RWSysReg<"DBGWVR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b110>; +def : RWSysReg<"DBGWVR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b110>; +def : RWSysReg<"DBGWVR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b110>; +def : RWSysReg<"DBGWVR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b110>; +def : RWSysReg<"DBGWVR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b110>; +def : RWSysReg<"DBGWVR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b110>; +def : RWSysReg<"DBGWVR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b110>; +def : RWSysReg<"DBGWVR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b110>; +def : RWSysReg<"DBGWVR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b110>; +def : RWSysReg<"DBGWVR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b110>; +def : RWSysReg<"DBGWVR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b110>; +def : RWSysReg<"DBGWVR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b110>; +def : RWSysReg<"DBGWVR13_EL1", 0b10, 0b000, 0b0000, 0b1101, 0b110>; +def : RWSysReg<"DBGWVR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b110>; +def : RWSysReg<"DBGWVR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b110>; +def : RWSysReg<"DBGWCR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b111>; +def : RWSysReg<"DBGWCR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b111>; +def : RWSysReg<"DBGWCR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b111>; +def : RWSysReg<"DBGWCR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b111>; +def : RWSysReg<"DBGWCR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b111>; +def : RWSysReg<"DBGWCR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b111>; +def : RWSysReg<"DBGWCR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b111>; +def : RWSysReg<"DBGWCR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b111>; +def : RWSysReg<"DBGWCR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b111>; +def : RWSysReg<"DBGWCR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b111>; +def : RWSysReg<"DBGWCR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b111>; +def : RWSysReg<"DBGWCR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b111>; +def : RWSysReg<"DBGWCR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b111>; +def : RWSysReg<"DBGWCR13_EL1", 0b10, 0b000, 0b0000, 0b1101, 0b111>; +def : RWSysReg<"DBGWCR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b111>; +def : RWSysReg<"DBGWCR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b111>; +def : RWSysReg<"TEEHBR32_EL1", 0b10, 0b010, 0b0001, 0b0000, 0b000>; +def : RWSysReg<"OSDLR_EL1", 
0b10, 0b000, 0b0001, 0b0011, 0b100>; +def : RWSysReg<"DBGPRCR_EL1", 0b10, 0b000, 0b0001, 0b0100, 0b100>; +def : RWSysReg<"DBGCLAIMSET_EL1", 0b10, 0b000, 0b0111, 0b1000, 0b110>; +def : RWSysReg<"DBGCLAIMCLR_EL1", 0b10, 0b000, 0b0111, 0b1001, 0b110>; +def : RWSysReg<"CSSELR_EL1", 0b11, 0b010, 0b0000, 0b0000, 0b000>; +def : RWSysReg<"VPIDR_EL2", 0b11, 0b100, 0b0000, 0b0000, 0b000>; +def : RWSysReg<"VMPIDR_EL2", 0b11, 0b100, 0b0000, 0b0000, 0b101>; +def : RWSysReg<"CPACR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b010>; +def : RWSysReg<"SCTLR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b000>; +def : RWSysReg<"SCTLR_EL2", 0b11, 0b100, 0b0001, 0b0000, 0b000>; +def : RWSysReg<"SCTLR_EL3", 0b11, 0b110, 0b0001, 0b0000, 0b000>; +def : RWSysReg<"ACTLR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b001>; +def : RWSysReg<"ACTLR_EL2", 0b11, 0b100, 0b0001, 0b0000, 0b001>; +def : RWSysReg<"ACTLR_EL3", 0b11, 0b110, 0b0001, 0b0000, 0b001>; +def : RWSysReg<"HCR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b000>; +def : RWSysReg<"SCR_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b000>; +def : RWSysReg<"MDCR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b001>; +def : RWSysReg<"SDER32_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b001>; +def : RWSysReg<"CPTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b010>; +def : RWSysReg<"CPTR_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b010>; +def : RWSysReg<"HSTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b011>; +def : RWSysReg<"HACR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b111>; +def : RWSysReg<"MDCR_EL3", 0b11, 0b110, 0b0001, 0b0011, 0b001>; +def : RWSysReg<"TTBR0_EL1", 0b11, 0b000, 0b0010, 0b0000, 0b000>; +def : RWSysReg<"TTBR0_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b000>; +def : RWSysReg<"TTBR0_EL3", 0b11, 0b110, 0b0010, 0b0000, 0b000>; +def : RWSysReg<"TTBR1_EL1", 0b11, 0b000, 0b0010, 0b0000, 0b001>; +def : RWSysReg<"TCR_EL1", 0b11, 0b000, 0b0010, 0b0000, 0b010>; +def : RWSysReg<"TCR_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b010>; +def : RWSysReg<"TCR_EL3", 0b11, 0b110, 0b0010, 0b0000, 0b010>; +def : RWSysReg<"VTTBR_EL2", 0b11, 0b100, 0b0010, 0b0001, 0b000>; +def : RWSysReg<"VTCR_EL2", 0b11, 0b100, 0b0010, 0b0001, 0b010>; +def : RWSysReg<"DACR32_EL2", 0b11, 0b100, 0b0011, 0b0000, 0b000>; +def : RWSysReg<"SPSR_EL1", 0b11, 0b000, 0b0100, 0b0000, 0b000>; +def : RWSysReg<"SPSR_EL2", 0b11, 0b100, 0b0100, 0b0000, 0b000>; +def : RWSysReg<"SPSR_EL3", 0b11, 0b110, 0b0100, 0b0000, 0b000>; +def : RWSysReg<"ELR_EL1", 0b11, 0b000, 0b0100, 0b0000, 0b001>; +def : RWSysReg<"ELR_EL2", 0b11, 0b100, 0b0100, 0b0000, 0b001>; +def : RWSysReg<"ELR_EL3", 0b11, 0b110, 0b0100, 0b0000, 0b001>; +def : RWSysReg<"SP_EL0", 0b11, 0b000, 0b0100, 0b0001, 0b000>; +def : RWSysReg<"SP_EL1", 0b11, 0b100, 0b0100, 0b0001, 0b000>; +def : RWSysReg<"SP_EL2", 0b11, 0b110, 0b0100, 0b0001, 0b000>; +def : RWSysReg<"SPSel", 0b11, 0b000, 0b0100, 0b0010, 0b000>; +def : RWSysReg<"NZCV", 0b11, 0b011, 0b0100, 0b0010, 0b000>; +def : RWSysReg<"DAIF", 0b11, 0b011, 0b0100, 0b0010, 0b001>; +def : RWSysReg<"CurrentEL", 0b11, 0b000, 0b0100, 0b0010, 0b010>; +def : RWSysReg<"SPSR_irq", 0b11, 0b100, 0b0100, 0b0011, 0b000>; +def : RWSysReg<"SPSR_abt", 0b11, 0b100, 0b0100, 0b0011, 0b001>; +def : RWSysReg<"SPSR_und", 0b11, 0b100, 0b0100, 0b0011, 0b010>; +def : RWSysReg<"SPSR_fiq", 0b11, 0b100, 0b0100, 0b0011, 0b011>; +def : RWSysReg<"FPCR", 0b11, 0b011, 0b0100, 0b0100, 0b000>; +def : RWSysReg<"FPSR", 0b11, 0b011, 0b0100, 0b0100, 0b001>; +def : RWSysReg<"DSPSR_EL0", 0b11, 0b011, 0b0100, 0b0101, 0b000>; +def : RWSysReg<"DLR_EL0", 0b11, 0b011, 0b0100, 0b0101, 0b001>; +def : RWSysReg<"IFSR32_EL2", 0b11, 0b100, 0b0101, 0b0000, 0b001>; 
+def : RWSysReg<"AFSR0_EL1", 0b11, 0b000, 0b0101, 0b0001, 0b000>; +def : RWSysReg<"AFSR0_EL2", 0b11, 0b100, 0b0101, 0b0001, 0b000>; +def : RWSysReg<"AFSR0_EL3", 0b11, 0b110, 0b0101, 0b0001, 0b000>; +def : RWSysReg<"AFSR1_EL1", 0b11, 0b000, 0b0101, 0b0001, 0b001>; +def : RWSysReg<"AFSR1_EL2", 0b11, 0b100, 0b0101, 0b0001, 0b001>; +def : RWSysReg<"AFSR1_EL3", 0b11, 0b110, 0b0101, 0b0001, 0b001>; +def : RWSysReg<"ESR_EL1", 0b11, 0b000, 0b0101, 0b0010, 0b000>; +def : RWSysReg<"ESR_EL2", 0b11, 0b100, 0b0101, 0b0010, 0b000>; +def : RWSysReg<"ESR_EL3", 0b11, 0b110, 0b0101, 0b0010, 0b000>; +def : RWSysReg<"FPEXC32_EL2", 0b11, 0b100, 0b0101, 0b0011, 0b000>; +def : RWSysReg<"FAR_EL1", 0b11, 0b000, 0b0110, 0b0000, 0b000>; +def : RWSysReg<"FAR_EL2", 0b11, 0b100, 0b0110, 0b0000, 0b000>; +def : RWSysReg<"FAR_EL3", 0b11, 0b110, 0b0110, 0b0000, 0b000>; +def : RWSysReg<"HPFAR_EL2", 0b11, 0b100, 0b0110, 0b0000, 0b100>; +def : RWSysReg<"PAR_EL1", 0b11, 0b000, 0b0111, 0b0100, 0b000>; +def : RWSysReg<"PMCR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b000>; +def : RWSysReg<"PMCNTENSET_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b001>; +def : RWSysReg<"PMCNTENCLR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b010>; +def : RWSysReg<"PMOVSCLR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b011>; +def : RWSysReg<"PMSELR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b101>; +def : RWSysReg<"PMCCNTR_EL0", 0b11, 0b011, 0b1001, 0b1101, 0b000>; +def : RWSysReg<"PMXEVTYPER_EL0", 0b11, 0b011, 0b1001, 0b1101, 0b001>; +def : RWSysReg<"PMXEVCNTR_EL0", 0b11, 0b011, 0b1001, 0b1101, 0b010>; +def : RWSysReg<"PMUSERENR_EL0", 0b11, 0b011, 0b1001, 0b1110, 0b000>; +def : RWSysReg<"PMINTENSET_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b001>; +def : RWSysReg<"PMINTENCLR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b010>; +def : RWSysReg<"PMOVSSET_EL0", 0b11, 0b011, 0b1001, 0b1110, 0b011>; +def : RWSysReg<"MAIR_EL1", 0b11, 0b000, 0b1010, 0b0010, 0b000>; +def : RWSysReg<"MAIR_EL2", 0b11, 0b100, 0b1010, 0b0010, 0b000>; +def : RWSysReg<"MAIR_EL3", 0b11, 0b110, 0b1010, 0b0010, 0b000>; +def : RWSysReg<"AMAIR_EL1", 0b11, 0b000, 0b1010, 0b0011, 0b000>; +def : RWSysReg<"AMAIR_EL2", 0b11, 0b100, 0b1010, 0b0011, 0b000>; +def : RWSysReg<"AMAIR_EL3", 0b11, 0b110, 0b1010, 0b0011, 0b000>; +def : RWSysReg<"VBAR_EL1", 0b11, 0b000, 0b1100, 0b0000, 0b000>; +def : RWSysReg<"VBAR_EL2", 0b11, 0b100, 0b1100, 0b0000, 0b000>; +def : RWSysReg<"VBAR_EL3", 0b11, 0b110, 0b1100, 0b0000, 0b000>; +def : RWSysReg<"RMR_EL1", 0b11, 0b000, 0b1100, 0b0000, 0b010>; +def : RWSysReg<"RMR_EL2", 0b11, 0b100, 0b1100, 0b0000, 0b010>; +def : RWSysReg<"RMR_EL3", 0b11, 0b110, 0b1100, 0b0000, 0b010>; +def : RWSysReg<"CONTEXTIDR_EL1", 0b11, 0b000, 0b1101, 0b0000, 0b001>; +def : RWSysReg<"TPIDR_EL0", 0b11, 0b011, 0b1101, 0b0000, 0b010>; +def : RWSysReg<"TPIDR_EL2", 0b11, 0b100, 0b1101, 0b0000, 0b010>; +def : RWSysReg<"TPIDR_EL3", 0b11, 0b110, 0b1101, 0b0000, 0b010>; +def : RWSysReg<"TPIDRRO_EL0", 0b11, 0b011, 0b1101, 0b0000, 0b011>; +def : RWSysReg<"TPIDR_EL1", 0b11, 0b000, 0b1101, 0b0000, 0b100>; +def : RWSysReg<"CNTFRQ_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b000>; +def : RWSysReg<"CNTVOFF_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b011>; +def : RWSysReg<"CNTKCTL_EL1", 0b11, 0b000, 0b1110, 0b0001, 0b000>; +def : RWSysReg<"CNTHCTL_EL2", 0b11, 0b100, 0b1110, 0b0001, 0b000>; +def : RWSysReg<"CNTP_TVAL_EL0", 0b11, 0b011, 0b1110, 0b0010, 0b000>; +def : RWSysReg<"CNTHP_TVAL_EL2", 0b11, 0b100, 0b1110, 0b0010, 0b000>; +def : RWSysReg<"CNTPS_TVAL_EL1", 0b11, 0b111, 0b1110, 0b0010, 0b000>; +def : RWSysReg<"CNTP_CTL_EL0", 0b11, 0b011, 0b1110, 0b0010, 0b001>; +def : 
RWSysReg<"CNTHP_CTL_EL2", 0b11, 0b100, 0b1110, 0b0010, 0b001>; +def : RWSysReg<"CNTPS_CTL_EL1", 0b11, 0b111, 0b1110, 0b0010, 0b001>; +def : RWSysReg<"CNTP_CVAL_EL0", 0b11, 0b011, 0b1110, 0b0010, 0b010>; +def : RWSysReg<"CNTHP_CVAL_EL2", 0b11, 0b100, 0b1110, 0b0010, 0b010>; +def : RWSysReg<"CNTPS_CVAL_EL1", 0b11, 0b111, 0b1110, 0b0010, 0b010>; +def : RWSysReg<"CNTV_TVAL_EL0", 0b11, 0b011, 0b1110, 0b0011, 0b000>; +def : RWSysReg<"CNTV_CTL_EL0", 0b11, 0b011, 0b1110, 0b0011, 0b001>; +def : RWSysReg<"CNTV_CVAL_EL0", 0b11, 0b011, 0b1110, 0b0011, 0b010>; +def : RWSysReg<"PMEVCNTR0_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b000>; +def : RWSysReg<"PMEVCNTR1_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b001>; +def : RWSysReg<"PMEVCNTR2_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b010>; +def : RWSysReg<"PMEVCNTR3_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b011>; +def : RWSysReg<"PMEVCNTR4_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b100>; +def : RWSysReg<"PMEVCNTR5_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b101>; +def : RWSysReg<"PMEVCNTR6_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b110>; +def : RWSysReg<"PMEVCNTR7_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b111>; +def : RWSysReg<"PMEVCNTR8_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b000>; +def : RWSysReg<"PMEVCNTR9_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b001>; +def : RWSysReg<"PMEVCNTR10_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b010>; +def : RWSysReg<"PMEVCNTR11_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b011>; +def : RWSysReg<"PMEVCNTR12_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b100>; +def : RWSysReg<"PMEVCNTR13_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b101>; +def : RWSysReg<"PMEVCNTR14_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b110>; +def : RWSysReg<"PMEVCNTR15_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b111>; +def : RWSysReg<"PMEVCNTR16_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b000>; +def : RWSysReg<"PMEVCNTR17_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b001>; +def : RWSysReg<"PMEVCNTR18_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b010>; +def : RWSysReg<"PMEVCNTR19_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b011>; +def : RWSysReg<"PMEVCNTR20_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b100>; +def : RWSysReg<"PMEVCNTR21_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b101>; +def : RWSysReg<"PMEVCNTR22_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b110>; +def : RWSysReg<"PMEVCNTR23_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b111>; +def : RWSysReg<"PMEVCNTR24_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b000>; +def : RWSysReg<"PMEVCNTR25_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b001>; +def : RWSysReg<"PMEVCNTR26_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b010>; +def : RWSysReg<"PMEVCNTR27_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b011>; +def : RWSysReg<"PMEVCNTR28_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b100>; +def : RWSysReg<"PMEVCNTR29_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b101>; +def : RWSysReg<"PMEVCNTR30_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b110>; +def : RWSysReg<"PMCCFILTR_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b111>; +def : RWSysReg<"PMEVTYPER0_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b000>; +def : RWSysReg<"PMEVTYPER1_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b001>; +def : RWSysReg<"PMEVTYPER2_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b010>; +def : RWSysReg<"PMEVTYPER3_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b011>; +def : RWSysReg<"PMEVTYPER4_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b100>; +def : RWSysReg<"PMEVTYPER5_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b101>; +def : RWSysReg<"PMEVTYPER6_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b110>; +def : RWSysReg<"PMEVTYPER7_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b111>; +def : RWSysReg<"PMEVTYPER8_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b000>; +def : RWSysReg<"PMEVTYPER9_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b001>; +def : 
RWSysReg<"PMEVTYPER10_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b010>; +def : RWSysReg<"PMEVTYPER11_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b011>; +def : RWSysReg<"PMEVTYPER12_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b100>; +def : RWSysReg<"PMEVTYPER13_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b101>; +def : RWSysReg<"PMEVTYPER14_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b110>; +def : RWSysReg<"PMEVTYPER15_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b111>; +def : RWSysReg<"PMEVTYPER16_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b000>; +def : RWSysReg<"PMEVTYPER17_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b001>; +def : RWSysReg<"PMEVTYPER18_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b010>; +def : RWSysReg<"PMEVTYPER19_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b011>; +def : RWSysReg<"PMEVTYPER20_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b100>; +def : RWSysReg<"PMEVTYPER21_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b101>; +def : RWSysReg<"PMEVTYPER22_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b110>; +def : RWSysReg<"PMEVTYPER23_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b111>; +def : RWSysReg<"PMEVTYPER24_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b000>; +def : RWSysReg<"PMEVTYPER25_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b001>; +def : RWSysReg<"PMEVTYPER26_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b010>; +def : RWSysReg<"PMEVTYPER27_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b011>; +def : RWSysReg<"PMEVTYPER28_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b100>; +def : RWSysReg<"PMEVTYPER29_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b101>; +def : RWSysReg<"PMEVTYPER30_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b110>; + +// Trace registers +// Op0 Op1 CRn CRm Op2 +def : RWSysReg<"TRCPRGCTLR", 0b10, 0b001, 0b0000, 0b0001, 0b000>; +def : RWSysReg<"TRCPROCSELR", 0b10, 0b001, 0b0000, 0b0010, 0b000>; +def : RWSysReg<"TRCCONFIGR", 0b10, 0b001, 0b0000, 0b0100, 0b000>; +def : RWSysReg<"TRCAUXCTLR", 0b10, 0b001, 0b0000, 0b0110, 0b000>; +def : RWSysReg<"TRCEVENTCTL0R", 0b10, 0b001, 0b0000, 0b1000, 0b000>; +def : RWSysReg<"TRCEVENTCTL1R", 0b10, 0b001, 0b0000, 0b1001, 0b000>; +def : RWSysReg<"TRCSTALLCTLR", 0b10, 0b001, 0b0000, 0b1011, 0b000>; +def : RWSysReg<"TRCTSCTLR", 0b10, 0b001, 0b0000, 0b1100, 0b000>; +def : RWSysReg<"TRCSYNCPR", 0b10, 0b001, 0b0000, 0b1101, 0b000>; +def : RWSysReg<"TRCCCCTLR", 0b10, 0b001, 0b0000, 0b1110, 0b000>; +def : RWSysReg<"TRCBBCTLR", 0b10, 0b001, 0b0000, 0b1111, 0b000>; +def : RWSysReg<"TRCTRACEIDR", 0b10, 0b001, 0b0000, 0b0000, 0b001>; +def : RWSysReg<"TRCQCTLR", 0b10, 0b001, 0b0000, 0b0001, 0b001>; +def : RWSysReg<"TRCVICTLR", 0b10, 0b001, 0b0000, 0b0000, 0b010>; +def : RWSysReg<"TRCVIIECTLR", 0b10, 0b001, 0b0000, 0b0001, 0b010>; +def : RWSysReg<"TRCVISSCTLR", 0b10, 0b001, 0b0000, 0b0010, 0b010>; +def : RWSysReg<"TRCVIPCSSCTLR", 0b10, 0b001, 0b0000, 0b0011, 0b010>; +def : RWSysReg<"TRCVDCTLR", 0b10, 0b001, 0b0000, 0b1000, 0b010>; +def : RWSysReg<"TRCVDSACCTLR", 0b10, 0b001, 0b0000, 0b1001, 0b010>; +def : RWSysReg<"TRCVDARCCTLR", 0b10, 0b001, 0b0000, 0b1010, 0b010>; +def : RWSysReg<"TRCSEQEVR0", 0b10, 0b001, 0b0000, 0b0000, 0b100>; +def : RWSysReg<"TRCSEQEVR1", 0b10, 0b001, 0b0000, 0b0001, 0b100>; +def : RWSysReg<"TRCSEQEVR2", 0b10, 0b001, 0b0000, 0b0010, 0b100>; +def : RWSysReg<"TRCSEQRSTEVR", 0b10, 0b001, 0b0000, 0b0110, 0b100>; +def : RWSysReg<"TRCSEQSTR", 0b10, 0b001, 0b0000, 0b0111, 0b100>; +def : RWSysReg<"TRCEXTINSELR", 0b10, 0b001, 0b0000, 0b1000, 0b100>; +def : RWSysReg<"TRCCNTRLDVR0", 0b10, 0b001, 0b0000, 0b0000, 0b101>; +def : RWSysReg<"TRCCNTRLDVR1", 0b10, 0b001, 0b0000, 0b0001, 0b101>; +def : RWSysReg<"TRCCNTRLDVR2", 0b10, 0b001, 0b0000, 0b0010, 0b101>; +def : RWSysReg<"TRCCNTRLDVR3", 0b10, 
0b001, 0b0000, 0b0011, 0b101>; +def : RWSysReg<"TRCCNTCTLR0", 0b10, 0b001, 0b0000, 0b0100, 0b101>; +def : RWSysReg<"TRCCNTCTLR1", 0b10, 0b001, 0b0000, 0b0101, 0b101>; +def : RWSysReg<"TRCCNTCTLR2", 0b10, 0b001, 0b0000, 0b0110, 0b101>; +def : RWSysReg<"TRCCNTCTLR3", 0b10, 0b001, 0b0000, 0b0111, 0b101>; +def : RWSysReg<"TRCCNTVR0", 0b10, 0b001, 0b0000, 0b1000, 0b101>; +def : RWSysReg<"TRCCNTVR1", 0b10, 0b001, 0b0000, 0b1001, 0b101>; +def : RWSysReg<"TRCCNTVR2", 0b10, 0b001, 0b0000, 0b1010, 0b101>; +def : RWSysReg<"TRCCNTVR3", 0b10, 0b001, 0b0000, 0b1011, 0b101>; +def : RWSysReg<"TRCIMSPEC0", 0b10, 0b001, 0b0000, 0b0000, 0b111>; +def : RWSysReg<"TRCIMSPEC1", 0b10, 0b001, 0b0000, 0b0001, 0b111>; +def : RWSysReg<"TRCIMSPEC2", 0b10, 0b001, 0b0000, 0b0010, 0b111>; +def : RWSysReg<"TRCIMSPEC3", 0b10, 0b001, 0b0000, 0b0011, 0b111>; +def : RWSysReg<"TRCIMSPEC4", 0b10, 0b001, 0b0000, 0b0100, 0b111>; +def : RWSysReg<"TRCIMSPEC5", 0b10, 0b001, 0b0000, 0b0101, 0b111>; +def : RWSysReg<"TRCIMSPEC6", 0b10, 0b001, 0b0000, 0b0110, 0b111>; +def : RWSysReg<"TRCIMSPEC7", 0b10, 0b001, 0b0000, 0b0111, 0b111>; +def : RWSysReg<"TRCRSCTLR2", 0b10, 0b001, 0b0001, 0b0010, 0b000>; +def : RWSysReg<"TRCRSCTLR3", 0b10, 0b001, 0b0001, 0b0011, 0b000>; +def : RWSysReg<"TRCRSCTLR4", 0b10, 0b001, 0b0001, 0b0100, 0b000>; +def : RWSysReg<"TRCRSCTLR5", 0b10, 0b001, 0b0001, 0b0101, 0b000>; +def : RWSysReg<"TRCRSCTLR6", 0b10, 0b001, 0b0001, 0b0110, 0b000>; +def : RWSysReg<"TRCRSCTLR7", 0b10, 0b001, 0b0001, 0b0111, 0b000>; +def : RWSysReg<"TRCRSCTLR8", 0b10, 0b001, 0b0001, 0b1000, 0b000>; +def : RWSysReg<"TRCRSCTLR9", 0b10, 0b001, 0b0001, 0b1001, 0b000>; +def : RWSysReg<"TRCRSCTLR10", 0b10, 0b001, 0b0001, 0b1010, 0b000>; +def : RWSysReg<"TRCRSCTLR11", 0b10, 0b001, 0b0001, 0b1011, 0b000>; +def : RWSysReg<"TRCRSCTLR12", 0b10, 0b001, 0b0001, 0b1100, 0b000>; +def : RWSysReg<"TRCRSCTLR13", 0b10, 0b001, 0b0001, 0b1101, 0b000>; +def : RWSysReg<"TRCRSCTLR14", 0b10, 0b001, 0b0001, 0b1110, 0b000>; +def : RWSysReg<"TRCRSCTLR15", 0b10, 0b001, 0b0001, 0b1111, 0b000>; +def : RWSysReg<"TRCRSCTLR16", 0b10, 0b001, 0b0001, 0b0000, 0b001>; +def : RWSysReg<"TRCRSCTLR17", 0b10, 0b001, 0b0001, 0b0001, 0b001>; +def : RWSysReg<"TRCRSCTLR18", 0b10, 0b001, 0b0001, 0b0010, 0b001>; +def : RWSysReg<"TRCRSCTLR19", 0b10, 0b001, 0b0001, 0b0011, 0b001>; +def : RWSysReg<"TRCRSCTLR20", 0b10, 0b001, 0b0001, 0b0100, 0b001>; +def : RWSysReg<"TRCRSCTLR21", 0b10, 0b001, 0b0001, 0b0101, 0b001>; +def : RWSysReg<"TRCRSCTLR22", 0b10, 0b001, 0b0001, 0b0110, 0b001>; +def : RWSysReg<"TRCRSCTLR23", 0b10, 0b001, 0b0001, 0b0111, 0b001>; +def : RWSysReg<"TRCRSCTLR24", 0b10, 0b001, 0b0001, 0b1000, 0b001>; +def : RWSysReg<"TRCRSCTLR25", 0b10, 0b001, 0b0001, 0b1001, 0b001>; +def : RWSysReg<"TRCRSCTLR26", 0b10, 0b001, 0b0001, 0b1010, 0b001>; +def : RWSysReg<"TRCRSCTLR27", 0b10, 0b001, 0b0001, 0b1011, 0b001>; +def : RWSysReg<"TRCRSCTLR28", 0b10, 0b001, 0b0001, 0b1100, 0b001>; +def : RWSysReg<"TRCRSCTLR29", 0b10, 0b001, 0b0001, 0b1101, 0b001>; +def : RWSysReg<"TRCRSCTLR30", 0b10, 0b001, 0b0001, 0b1110, 0b001>; +def : RWSysReg<"TRCRSCTLR31", 0b10, 0b001, 0b0001, 0b1111, 0b001>; +def : RWSysReg<"TRCSSCCR0", 0b10, 0b001, 0b0001, 0b0000, 0b010>; +def : RWSysReg<"TRCSSCCR1", 0b10, 0b001, 0b0001, 0b0001, 0b010>; +def : RWSysReg<"TRCSSCCR2", 0b10, 0b001, 0b0001, 0b0010, 0b010>; +def : RWSysReg<"TRCSSCCR3", 0b10, 0b001, 0b0001, 0b0011, 0b010>; +def : RWSysReg<"TRCSSCCR4", 0b10, 0b001, 0b0001, 0b0100, 0b010>; +def : RWSysReg<"TRCSSCCR5", 0b10, 0b001, 0b0001, 0b0101, 0b010>; +def : 
RWSysReg<"TRCSSCCR6", 0b10, 0b001, 0b0001, 0b0110, 0b010>; +def : RWSysReg<"TRCSSCCR7", 0b10, 0b001, 0b0001, 0b0111, 0b010>; +def : RWSysReg<"TRCSSCSR0", 0b10, 0b001, 0b0001, 0b1000, 0b010>; +def : RWSysReg<"TRCSSCSR1", 0b10, 0b001, 0b0001, 0b1001, 0b010>; +def : RWSysReg<"TRCSSCSR2", 0b10, 0b001, 0b0001, 0b1010, 0b010>; +def : RWSysReg<"TRCSSCSR3", 0b10, 0b001, 0b0001, 0b1011, 0b010>; +def : RWSysReg<"TRCSSCSR4", 0b10, 0b001, 0b0001, 0b1100, 0b010>; +def : RWSysReg<"TRCSSCSR5", 0b10, 0b001, 0b0001, 0b1101, 0b010>; +def : RWSysReg<"TRCSSCSR6", 0b10, 0b001, 0b0001, 0b1110, 0b010>; +def : RWSysReg<"TRCSSCSR7", 0b10, 0b001, 0b0001, 0b1111, 0b010>; +def : RWSysReg<"TRCSSPCICR0", 0b10, 0b001, 0b0001, 0b0000, 0b011>; +def : RWSysReg<"TRCSSPCICR1", 0b10, 0b001, 0b0001, 0b0001, 0b011>; +def : RWSysReg<"TRCSSPCICR2", 0b10, 0b001, 0b0001, 0b0010, 0b011>; +def : RWSysReg<"TRCSSPCICR3", 0b10, 0b001, 0b0001, 0b0011, 0b011>; +def : RWSysReg<"TRCSSPCICR4", 0b10, 0b001, 0b0001, 0b0100, 0b011>; +def : RWSysReg<"TRCSSPCICR5", 0b10, 0b001, 0b0001, 0b0101, 0b011>; +def : RWSysReg<"TRCSSPCICR6", 0b10, 0b001, 0b0001, 0b0110, 0b011>; +def : RWSysReg<"TRCSSPCICR7", 0b10, 0b001, 0b0001, 0b0111, 0b011>; +def : RWSysReg<"TRCPDCR", 0b10, 0b001, 0b0001, 0b0100, 0b100>; +def : RWSysReg<"TRCACVR0", 0b10, 0b001, 0b0010, 0b0000, 0b000>; +def : RWSysReg<"TRCACVR1", 0b10, 0b001, 0b0010, 0b0010, 0b000>; +def : RWSysReg<"TRCACVR2", 0b10, 0b001, 0b0010, 0b0100, 0b000>; +def : RWSysReg<"TRCACVR3", 0b10, 0b001, 0b0010, 0b0110, 0b000>; +def : RWSysReg<"TRCACVR4", 0b10, 0b001, 0b0010, 0b1000, 0b000>; +def : RWSysReg<"TRCACVR5", 0b10, 0b001, 0b0010, 0b1010, 0b000>; +def : RWSysReg<"TRCACVR6", 0b10, 0b001, 0b0010, 0b1100, 0b000>; +def : RWSysReg<"TRCACVR7", 0b10, 0b001, 0b0010, 0b1110, 0b000>; +def : RWSysReg<"TRCACVR8", 0b10, 0b001, 0b0010, 0b0000, 0b001>; +def : RWSysReg<"TRCACVR9", 0b10, 0b001, 0b0010, 0b0010, 0b001>; +def : RWSysReg<"TRCACVR10", 0b10, 0b001, 0b0010, 0b0100, 0b001>; +def : RWSysReg<"TRCACVR11", 0b10, 0b001, 0b0010, 0b0110, 0b001>; +def : RWSysReg<"TRCACVR12", 0b10, 0b001, 0b0010, 0b1000, 0b001>; +def : RWSysReg<"TRCACVR13", 0b10, 0b001, 0b0010, 0b1010, 0b001>; +def : RWSysReg<"TRCACVR14", 0b10, 0b001, 0b0010, 0b1100, 0b001>; +def : RWSysReg<"TRCACVR15", 0b10, 0b001, 0b0010, 0b1110, 0b001>; +def : RWSysReg<"TRCACATR0", 0b10, 0b001, 0b0010, 0b0000, 0b010>; +def : RWSysReg<"TRCACATR1", 0b10, 0b001, 0b0010, 0b0010, 0b010>; +def : RWSysReg<"TRCACATR2", 0b10, 0b001, 0b0010, 0b0100, 0b010>; +def : RWSysReg<"TRCACATR3", 0b10, 0b001, 0b0010, 0b0110, 0b010>; +def : RWSysReg<"TRCACATR4", 0b10, 0b001, 0b0010, 0b1000, 0b010>; +def : RWSysReg<"TRCACATR5", 0b10, 0b001, 0b0010, 0b1010, 0b010>; +def : RWSysReg<"TRCACATR6", 0b10, 0b001, 0b0010, 0b1100, 0b010>; +def : RWSysReg<"TRCACATR7", 0b10, 0b001, 0b0010, 0b1110, 0b010>; +def : RWSysReg<"TRCACATR8", 0b10, 0b001, 0b0010, 0b0000, 0b011>; +def : RWSysReg<"TRCACATR9", 0b10, 0b001, 0b0010, 0b0010, 0b011>; +def : RWSysReg<"TRCACATR10", 0b10, 0b001, 0b0010, 0b0100, 0b011>; +def : RWSysReg<"TRCACATR11", 0b10, 0b001, 0b0010, 0b0110, 0b011>; +def : RWSysReg<"TRCACATR12", 0b10, 0b001, 0b0010, 0b1000, 0b011>; +def : RWSysReg<"TRCACATR13", 0b10, 0b001, 0b0010, 0b1010, 0b011>; +def : RWSysReg<"TRCACATR14", 0b10, 0b001, 0b0010, 0b1100, 0b011>; +def : RWSysReg<"TRCACATR15", 0b10, 0b001, 0b0010, 0b1110, 0b011>; +def : RWSysReg<"TRCDVCVR0", 0b10, 0b001, 0b0010, 0b0000, 0b100>; +def : RWSysReg<"TRCDVCVR1", 0b10, 0b001, 0b0010, 0b0100, 0b100>; +def : RWSysReg<"TRCDVCVR2", 0b10, 0b001, 0b0010, 
0b1000, 0b100>; +def : RWSysReg<"TRCDVCVR3", 0b10, 0b001, 0b0010, 0b1100, 0b100>; +def : RWSysReg<"TRCDVCVR4", 0b10, 0b001, 0b0010, 0b0000, 0b101>; +def : RWSysReg<"TRCDVCVR5", 0b10, 0b001, 0b0010, 0b0100, 0b101>; +def : RWSysReg<"TRCDVCVR6", 0b10, 0b001, 0b0010, 0b1000, 0b101>; +def : RWSysReg<"TRCDVCVR7", 0b10, 0b001, 0b0010, 0b1100, 0b101>; +def : RWSysReg<"TRCDVCMR0", 0b10, 0b001, 0b0010, 0b0000, 0b110>; +def : RWSysReg<"TRCDVCMR1", 0b10, 0b001, 0b0010, 0b0100, 0b110>; +def : RWSysReg<"TRCDVCMR2", 0b10, 0b001, 0b0010, 0b1000, 0b110>; +def : RWSysReg<"TRCDVCMR3", 0b10, 0b001, 0b0010, 0b1100, 0b110>; +def : RWSysReg<"TRCDVCMR4", 0b10, 0b001, 0b0010, 0b0000, 0b111>; +def : RWSysReg<"TRCDVCMR5", 0b10, 0b001, 0b0010, 0b0100, 0b111>; +def : RWSysReg<"TRCDVCMR6", 0b10, 0b001, 0b0010, 0b1000, 0b111>; +def : RWSysReg<"TRCDVCMR7", 0b10, 0b001, 0b0010, 0b1100, 0b111>; +def : RWSysReg<"TRCCIDCVR0", 0b10, 0b001, 0b0011, 0b0000, 0b000>; +def : RWSysReg<"TRCCIDCVR1", 0b10, 0b001, 0b0011, 0b0010, 0b000>; +def : RWSysReg<"TRCCIDCVR2", 0b10, 0b001, 0b0011, 0b0100, 0b000>; +def : RWSysReg<"TRCCIDCVR3", 0b10, 0b001, 0b0011, 0b0110, 0b000>; +def : RWSysReg<"TRCCIDCVR4", 0b10, 0b001, 0b0011, 0b1000, 0b000>; +def : RWSysReg<"TRCCIDCVR5", 0b10, 0b001, 0b0011, 0b1010, 0b000>; +def : RWSysReg<"TRCCIDCVR6", 0b10, 0b001, 0b0011, 0b1100, 0b000>; +def : RWSysReg<"TRCCIDCVR7", 0b10, 0b001, 0b0011, 0b1110, 0b000>; +def : RWSysReg<"TRCVMIDCVR0", 0b10, 0b001, 0b0011, 0b0000, 0b001>; +def : RWSysReg<"TRCVMIDCVR1", 0b10, 0b001, 0b0011, 0b0010, 0b001>; +def : RWSysReg<"TRCVMIDCVR2", 0b10, 0b001, 0b0011, 0b0100, 0b001>; +def : RWSysReg<"TRCVMIDCVR3", 0b10, 0b001, 0b0011, 0b0110, 0b001>; +def : RWSysReg<"TRCVMIDCVR4", 0b10, 0b001, 0b0011, 0b1000, 0b001>; +def : RWSysReg<"TRCVMIDCVR5", 0b10, 0b001, 0b0011, 0b1010, 0b001>; +def : RWSysReg<"TRCVMIDCVR6", 0b10, 0b001, 0b0011, 0b1100, 0b001>; +def : RWSysReg<"TRCVMIDCVR7", 0b10, 0b001, 0b0011, 0b1110, 0b001>; +def : RWSysReg<"TRCCIDCCTLR0", 0b10, 0b001, 0b0011, 0b0000, 0b010>; +def : RWSysReg<"TRCCIDCCTLR1", 0b10, 0b001, 0b0011, 0b0001, 0b010>; +def : RWSysReg<"TRCVMIDCCTLR0", 0b10, 0b001, 0b0011, 0b0010, 0b010>; +def : RWSysReg<"TRCVMIDCCTLR1", 0b10, 0b001, 0b0011, 0b0011, 0b010>; +def : RWSysReg<"TRCITCTRL", 0b10, 0b001, 0b0111, 0b0000, 0b100>; +def : RWSysReg<"TRCCLAIMSET", 0b10, 0b001, 0b0111, 0b1000, 0b110>; +def : RWSysReg<"TRCCLAIMCLR", 0b10, 0b001, 0b0111, 0b1001, 0b110>; + +// GICv3 registers +// Op0 Op1 CRn CRm Op2 +def : RWSysReg<"ICC_BPR1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b011>; +def : RWSysReg<"ICC_BPR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b011>; +def : RWSysReg<"ICC_PMR_EL1", 0b11, 0b000, 0b0100, 0b0110, 0b000>; +def : RWSysReg<"ICC_CTLR_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b100>; +def : RWSysReg<"ICC_CTLR_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b100>; +def : RWSysReg<"ICC_SRE_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b101>; +def : RWSysReg<"ICC_SRE_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b101>; +def : RWSysReg<"ICC_SRE_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b101>; +def : RWSysReg<"ICC_IGRPEN0_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b110>; +def : RWSysReg<"ICC_IGRPEN1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b111>; +def : RWSysReg<"ICC_IGRPEN1_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b111>; +def : RWSysReg<"ICC_SEIEN_EL1", 0b11, 0b000, 0b1100, 0b1101, 0b000>; +def : RWSysReg<"ICC_AP0R0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b100>; +def : RWSysReg<"ICC_AP0R1_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b101>; +def : RWSysReg<"ICC_AP0R2_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b110>; +def : 
RWSysReg<"ICC_AP0R3_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b111>; +def : RWSysReg<"ICC_AP1R0_EL1", 0b11, 0b000, 0b1100, 0b1001, 0b000>; +def : RWSysReg<"ICC_AP1R1_EL1", 0b11, 0b000, 0b1100, 0b1001, 0b001>; +def : RWSysReg<"ICC_AP1R2_EL1", 0b11, 0b000, 0b1100, 0b1001, 0b010>; +def : RWSysReg<"ICC_AP1R3_EL1", 0b11, 0b000, 0b1100, 0b1001, 0b011>; +def : RWSysReg<"ICH_AP0R0_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b000>; +def : RWSysReg<"ICH_AP0R1_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b001>; +def : RWSysReg<"ICH_AP0R2_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b010>; +def : RWSysReg<"ICH_AP0R3_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b011>; +def : RWSysReg<"ICH_AP1R0_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b000>; +def : RWSysReg<"ICH_AP1R1_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b001>; +def : RWSysReg<"ICH_AP1R2_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b010>; +def : RWSysReg<"ICH_AP1R3_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b011>; +def : RWSysReg<"ICH_HCR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b000>; +def : RWSysReg<"ICH_MISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b010>; +def : RWSysReg<"ICH_VMCR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b111>; +def : RWSysReg<"ICH_VSEIR_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b100>; +def : RWSysReg<"ICH_LR0_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b000>; +def : RWSysReg<"ICH_LR1_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b001>; +def : RWSysReg<"ICH_LR2_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b010>; +def : RWSysReg<"ICH_LR3_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b011>; +def : RWSysReg<"ICH_LR4_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b100>; +def : RWSysReg<"ICH_LR5_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b101>; +def : RWSysReg<"ICH_LR6_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b110>; +def : RWSysReg<"ICH_LR7_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b111>; +def : RWSysReg<"ICH_LR8_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b000>; +def : RWSysReg<"ICH_LR9_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b001>; +def : RWSysReg<"ICH_LR10_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b010>; +def : RWSysReg<"ICH_LR11_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b011>; +def : RWSysReg<"ICH_LR12_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b100>; +def : RWSysReg<"ICH_LR13_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b101>; +def : RWSysReg<"ICH_LR14_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b110>; +def : RWSysReg<"ICH_LR15_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b111>; + +// v8.1a "Privileged Access Never" extension-specific system registers +let Requires = [{ {AArch64::HasV8_1aOps} }] in +def : RWSysReg<"PAN", 0b11, 0b000, 0b0100, 0b0010, 0b011>; + +// v8.1a "Limited Ordering Regions" extension-specific system registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::HasV8_1aOps} }] in { +def : RWSysReg<"LORSA_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b000>; +def : RWSysReg<"LOREA_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b001>; +def : RWSysReg<"LORN_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b010>; +def : RWSysReg<"LORC_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b011>; +} + +// v8.1a "Virtualization hos extensions" system registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::HasV8_1aOps} }] in { +def : RWSysReg<"TTBR1_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b001>; +def : RWSysReg<"CONTEXTIDR_EL2", 0b11, 0b100, 0b1101, 0b0000, 0b001>; +def : RWSysReg<"CNTHV_TVAL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b000>; +def : RWSysReg<"CNTHV_CVAL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b010>; +def : RWSysReg<"CNTHV_CTL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b001>; +def : RWSysReg<"SCTLR_EL12", 0b11, 0b101, 0b0001, 0b0000, 0b000>; +def : RWSysReg<"CPACR_EL12", 0b11, 0b101, 0b0001, 0b0000, 0b010>; +def : RWSysReg<"TTBR0_EL12", 0b11, 0b101, 0b0010, 0b0000, 0b000>; 
+def : RWSysReg<"TTBR1_EL12", 0b11, 0b101, 0b0010, 0b0000, 0b001>; +def : RWSysReg<"TCR_EL12", 0b11, 0b101, 0b0010, 0b0000, 0b010>; +def : RWSysReg<"AFSR0_EL12", 0b11, 0b101, 0b0101, 0b0001, 0b000>; +def : RWSysReg<"AFSR1_EL12", 0b11, 0b101, 0b0101, 0b0001, 0b001>; +def : RWSysReg<"ESR_EL12", 0b11, 0b101, 0b0101, 0b0010, 0b000>; +def : RWSysReg<"FAR_EL12", 0b11, 0b101, 0b0110, 0b0000, 0b000>; +def : RWSysReg<"MAIR_EL12", 0b11, 0b101, 0b1010, 0b0010, 0b000>; +def : RWSysReg<"AMAIR_EL12", 0b11, 0b101, 0b1010, 0b0011, 0b000>; +def : RWSysReg<"VBAR_EL12", 0b11, 0b101, 0b1100, 0b0000, 0b000>; +def : RWSysReg<"CONTEXTIDR_EL12", 0b11, 0b101, 0b1101, 0b0000, 0b001>; +def : RWSysReg<"CNTKCTL_EL12", 0b11, 0b101, 0b1110, 0b0001, 0b000>; +def : RWSysReg<"CNTP_TVAL_EL02", 0b11, 0b101, 0b1110, 0b0010, 0b000>; +def : RWSysReg<"CNTP_CTL_EL02", 0b11, 0b101, 0b1110, 0b0010, 0b001>; +def : RWSysReg<"CNTP_CVAL_EL02", 0b11, 0b101, 0b1110, 0b0010, 0b010>; +def : RWSysReg<"CNTV_TVAL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b000>; +def : RWSysReg<"CNTV_CTL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b001>; +def : RWSysReg<"CNTV_CVAL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b010>; +def : RWSysReg<"SPSR_EL12", 0b11, 0b101, 0b0100, 0b0000, 0b000>; +def : RWSysReg<"ELR_EL12", 0b11, 0b101, 0b0100, 0b0000, 0b001>; +} +// v8.2a registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::HasV8_2aOps} }] in +def : RWSysReg<"UAO", 0b11, 0b000, 0b0100, 0b0010, 0b100>; + +// v8.2a "Statistical Profiling extension" registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureSPE} }] in { +def : RWSysReg<"PMBLIMITR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b000>; +def : RWSysReg<"PMBPTR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b001>; +def : RWSysReg<"PMBSR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b011>; +def : RWSysReg<"PMBIDR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b111>; +def : RWSysReg<"PMSCR_EL2", 0b11, 0b100, 0b1001, 0b1001, 0b000>; +def : RWSysReg<"PMSCR_EL12", 0b11, 0b101, 0b1001, 0b1001, 0b000>; +def : RWSysReg<"PMSCR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b000>; +def : RWSysReg<"PMSICR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b010>; +def : RWSysReg<"PMSIRR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b011>; +def : RWSysReg<"PMSFCR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b100>; +def : RWSysReg<"PMSEVFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b101>; +def : RWSysReg<"PMSLATFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b110>; +def : RWSysReg<"PMSIDR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b111>; +} + +// v8.2a "RAS extension" registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureRAS} }] in { +def : RWSysReg<"ERRSELR_EL1", 0b11, 0b000, 0b0101, 0b0011, 0b001>; +def : RWSysReg<"ERXCTLR_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b001>; +def : RWSysReg<"ERXSTATUS_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b010>; +def : RWSysReg<"ERXADDR_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b011>; +def : RWSysReg<"ERXMISC0_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b000>; +def : RWSysReg<"ERXMISC1_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b001>; +def : RWSysReg<"DISR_EL1", 0b11, 0b000, 0b1100, 0b0001, 0b001>; +def : RWSysReg<"VDISR_EL2", 0b11, 0b100, 0b1100, 0b0001, 0b001>; +def : RWSysReg<"VSESR_EL2", 0b11, 0b100, 0b0101, 0b0010, 0b011>; +} + +// Cyclone specific system registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::ProcCyclone} }] in +def : RWSysReg<"CPM_IOACC_CTL_EL3", 0b11, 0b111, 0b1111, 0b0010, 0b000>; diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index c52c5544fc7e2..0b6345ff8011b 100644 --- 
a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -11,13 +11,19 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" +#include "AArch64CallLowering.h" +#include "AArch64RegisterBankInfo.h" #include "AArch64TargetMachine.h" #include "AArch64TargetObjectFile.h" #include "AArch64TargetTransformInfo.h" +#include "llvm/CodeGen/GlobalISel/IRTranslator.h" +#include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Function.h" #include "llvm/IR/LegacyPassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetOptions.h" @@ -58,6 +64,11 @@ EnableDeadRegisterElimination("aarch64-dead-def-elimination", cl::Hidden, cl::init(true)); static cl::opt<bool> +EnableRedundantCopyElimination("aarch64-redundant-copy-elim", + cl::desc("Enable the redundant copy elimination pass"), + cl::init(true), cl::Hidden); + +static cl::opt<bool> EnableLoadStoreOpt("aarch64-load-store-opt", cl::desc("Enable the load/store pair" " optimization pass"), cl::init(true), cl::Hidden); @@ -92,11 +103,19 @@ static cl::opt<cl::boolOrDefault> EnableGlobalMerge("aarch64-global-merge", cl::Hidden, cl::desc("Enable the global merge pass")); +static cl::opt<bool> + EnableLoopDataPrefetch("aarch64-loop-data-prefetch", cl::Hidden, + cl::desc("Enable the loop data prefetch pass"), + cl::init(true)); + extern "C" void LLVMInitializeAArch64Target() { // Register the target. RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget); RegisterTargetMachine<AArch64beTargetMachine> Y(TheAArch64beTarget); RegisterTargetMachine<AArch64leTargetMachine> Z(TheARM64Target); + auto PR = PassRegistry::getPassRegistry(); + initializeGlobalISel(*PR); + initializeAArch64ExpandPseudoPass(*PR); } //===----------------------------------------------------------------------===// @@ -114,29 +133,79 @@ static std::string computeDataLayout(const Triple &TT, bool LittleEndian) { if (TT.isOSBinFormatMachO()) return "e-m:o-i64:64-i128:128-n32:64-S128"; if (LittleEndian) - return "e-m:e-i64:64-i128:128-n32:64-S128"; - return "E-m:e-i64:64-i128:128-n32:64-S128"; + return "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; + return "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; } -/// TargetMachine ctor - Create an AArch64 architecture model. +// Helper function to set up the defaults for reciprocals. +static void initReciprocals(AArch64TargetMachine& TM, AArch64Subtarget& ST) +{ + // For the estimates, convergence is quadratic, so essentially the number of + // digits is doubled after each iteration. ARMv8, the minimum architected + // accuracy of the initial estimate is 2^-8. Therefore, the number of extra + // steps to refine the result for float (23 mantissa bits) and for double + // (52 mantissa bits) are 2 and 3, respectively. 
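As an illustration of the step counts chosen just below (this only works out the arithmetic stated in the comment above): each Newton-Raphson step roughly doubles the number of correct bits, starting from the architected 8-bit initial estimate, so
    float  (23 mantissa bits + implicit bit = 24 needed): 8 -> 16 -> 32        => 2 extra steps
    double (52 mantissa bits + implicit bit = 53 needed): 8 -> 16 -> 32 -> 64  => 3 extra steps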
+ unsigned ExtraStepsF = 2, + ExtraStepsD = ExtraStepsF + 1; + bool UseRsqrt = ST.useRSqrt(); + + TM.Options.Reciprocals.setDefaults("sqrtf", UseRsqrt, ExtraStepsF); + TM.Options.Reciprocals.setDefaults("sqrtd", UseRsqrt, ExtraStepsD); + TM.Options.Reciprocals.setDefaults("vec-sqrtf", UseRsqrt, ExtraStepsF); + TM.Options.Reciprocals.setDefaults("vec-sqrtd", UseRsqrt, ExtraStepsD); + + TM.Options.Reciprocals.setDefaults("divf", false, ExtraStepsF); + TM.Options.Reciprocals.setDefaults("divd", false, ExtraStepsD); + TM.Options.Reciprocals.setDefaults("vec-divf", false, ExtraStepsF); + TM.Options.Reciprocals.setDefaults("vec-divd", false, ExtraStepsD); +} + +static Reloc::Model getEffectiveRelocModel(const Triple &TT, + Optional<Reloc::Model> RM) { + // AArch64 Darwin is always PIC. + if (TT.isOSDarwin()) + return Reloc::PIC_; + // On ELF platforms the default static relocation model has a smart enough + // linker to cope with referencing external symbols defined in a shared + // library. Hence DynamicNoPIC doesn't need to be promoted to PIC. + if (!RM.hasValue() || *RM == Reloc::DynamicNoPIC) + return Reloc::Static; + return *RM; +} + +/// Create an AArch64 architecture model. /// -AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool LittleEndian) +AArch64TargetMachine::AArch64TargetMachine( + const Target &T, const Triple &TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Optional<Reloc::Model> RM, + CodeModel::Model CM, CodeGenOpt::Level OL, bool LittleEndian) // This nested ternary is horrible, but DL needs to be properly // initialized before TLInfo is constructed. : LLVMTargetMachine(T, computeDataLayout(TT, LittleEndian), TT, CPU, FS, - Options, RM, CM, OL), + Options, getEffectiveRelocModel(TT, RM), CM, OL), TLOF(createTLOF(getTargetTriple())), - isLittle(LittleEndian) { + Subtarget(TT, CPU, FS, *this, LittleEndian) { + initReciprocals(*this, Subtarget); initAsmInfo(); } AArch64TargetMachine::~AArch64TargetMachine() {} +#ifdef LLVM_BUILD_GLOBAL_ISEL +namespace { +struct AArch64GISelActualAccessor : public GISelAccessor { + std::unique_ptr<CallLowering> CallLoweringInfo; + std::unique_ptr<RegisterBankInfo> RegBankInfo; + const CallLowering *getCallLowering() const override { + return CallLoweringInfo.get(); + } + const RegisterBankInfo *getRegBankInfo() const override { + return RegBankInfo.get(); + } +}; +} // End anonymous namespace. +#endif + const AArch64Subtarget * AArch64TargetMachine::getSubtargetImpl(const Function &F) const { Attribute CPUAttr = F.getFnAttribute("target-cpu"); @@ -156,7 +225,18 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const { // function that reside in TargetOptions. 
resetTargetOptions(F); I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this, - isLittle); + Subtarget.isLittleEndian()); +#ifndef LLVM_BUILD_GLOBAL_ISEL + GISelAccessor *GISel = new GISelAccessor(); +#else + AArch64GISelActualAccessor *GISel = + new AArch64GISelActualAccessor(); + GISel->CallLoweringInfo.reset( + new AArch64CallLowering(*I->getTargetLowering())); + GISel->RegBankInfo.reset( + new AArch64RegisterBankInfo(*I->getRegisterInfo())); +#endif + I->setGISelAccessor(*GISel); } return I.get(); } @@ -165,16 +245,16 @@ void AArch64leTargetMachine::anchor() { } AArch64leTargetMachine::AArch64leTargetMachine( const Target &T, const Triple &TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) + const TargetOptions &Options, Optional<Reloc::Model> RM, + CodeModel::Model CM, CodeGenOpt::Level OL) : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} void AArch64beTargetMachine::anchor() { } AArch64beTargetMachine::AArch64beTargetMachine( const Target &T, const Triple &TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) + const TargetOptions &Options, Optional<Reloc::Model> RM, + CodeModel::Model CM, CodeGenOpt::Level OL) : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} namespace { @@ -194,6 +274,10 @@ public: void addIRPasses() override; bool addPreISel() override; bool addInstSelector() override; +#ifdef LLVM_BUILD_GLOBAL_ISEL + bool addIRTranslator() override; + bool addRegBankSelect() override; +#endif bool addILPOpts() override; void addPreRegAlloc() override; void addPostRegAlloc() override; @@ -223,6 +307,13 @@ void AArch64PassConfig::addIRPasses() { if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy) addPass(createCFGSimplificationPass()); + // Run LoopDataPrefetch + // + // Run this before LSR to remove the multiplies involved in computing the + // pointer values N iterations ahead. + if (TM->getOptLevel() != CodeGenOpt::None && EnableLoopDataPrefetch) + addPass(createLoopDataPrefetchPass()); + TargetPassConfig::addIRPasses(); // Match interleaved memory accesses to ldN/stN intrinsics. @@ -278,6 +369,17 @@ bool AArch64PassConfig::addInstSelector() { return false; } +#ifdef LLVM_BUILD_GLOBAL_ISEL +bool AArch64PassConfig::addIRTranslator() { + addPass(new IRTranslator()); + return false; +} +bool AArch64PassConfig::addRegBankSelect() { + addPass(new RegBankSelect()); + return false; +} +#endif + bool AArch64PassConfig::addILPOpts() { if (EnableCondOpt) addPass(createAArch64ConditionOptimizerPass()); @@ -303,6 +405,10 @@ void AArch64PassConfig::addPreRegAlloc() { } void AArch64PassConfig::addPostRegAlloc() { + // Remove redundant copy instructions. + if (TM->getOptLevel() != CodeGenOpt::None && EnableRedundantCopyElimination) + addPass(createAArch64RedundantCopyEliminationPass()); + // Change dead register definitions to refer to the zero register. 
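A rough sketch of what these two late cleanups do (the precise patterns handled are those matched by the passes themselves): a definition whose value is never read can be redirected to the zero register, and a copy of zero into a register that a cbz/cbnz edge already proves to be zero can be dropped:
    subs w8, w0, w1        // w8 never read again  ->  subs wzr, w0, w1  (i.e. cmp w0, w1)
    cbz  w0, .Ltarget      // on the taken edge w0 is known to be zero,
                           // so a "mov w0, wzr" inside .Ltarget is redundant and is removed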
if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination) addPass(createAArch64DeadRegisterDefinitions()); diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h index 8d49a29386ac8..b44107b065bd0 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.h +++ b/lib/Target/AArch64/AArch64TargetMachine.h @@ -29,7 +29,7 @@ protected: public: AArch64TargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, + Optional<Reloc::Model> RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool IsLittleEndian); ~AArch64TargetMachine() override; @@ -46,28 +46,28 @@ public: } private: - bool isLittle; + AArch64Subtarget Subtarget; }; -// AArch64leTargetMachine - AArch64 little endian target machine. +// AArch64 little endian target machine. // class AArch64leTargetMachine : public AArch64TargetMachine { virtual void anchor(); public: AArch64leTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, + Optional<Reloc::Model> RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; -// AArch64beTargetMachine - AArch64 big endian target machine. +// AArch64 big endian target machine. // class AArch64beTargetMachine : public AArch64TargetMachine { virtual void anchor(); public: AArch64beTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, + Optional<Reloc::Model> RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 9af0e6444789a..ecf4d93068a4e 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -291,6 +291,61 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { return BaseT::getCastInstrCost(Opcode, Dst, Src); } +int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst, + VectorType *VecTy, + unsigned Index) { + + // Make sure we were given a valid extend opcode. + assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && + "Invalid opcode"); + + // We are extending an element we extract from a vector, so the source type + // of the extend is the element type of the vector. + auto *Src = VecTy->getElementType(); + + // Sign- and zero-extends are for integer types only. + assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type"); + + // Get the cost for the extract. We compute the cost (if any) for the extend + // below. + auto Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, Index); + + // Legalize the types. + auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy); + auto DstVT = TLI->getValueType(DL, Dst); + auto SrcVT = TLI->getValueType(DL, Src); + + // If the resulting type is still a vector and the destination type is legal, + // we may get the extension for free. If not, get the default cost for the + // extend. + if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) + return Cost + getCastInstrCost(Opcode, Dst, Src); + + // The destination type should be larger than the element type. If not, get + // the default cost for the extend. 
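A concrete case of what this hook is modelling (illustrative IR, not taken from this patch's tests):
    %elt = extractelement <8 x i16> %v, i32 1
    %ext = sext i16 %elt to i32
lowers to a single "smov w0, v0.h[1]", so only the extract cost is charged; zero-extends are likewise folded into "umov", except when an i8/i16 element is extended all the way to i64, which falls through to the default extend cost handled below.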
+ if (DstVT.getSizeInBits() < SrcVT.getSizeInBits()) + return Cost + getCastInstrCost(Opcode, Dst, Src); + + switch (Opcode) { + default: + llvm_unreachable("Opcode should be either SExt or ZExt"); + + // For sign-extends, we only need a smov, which performs the extension + // automatically. + case Instruction::SExt: + return Cost; + + // For zero-extends, the extend is performed automatically by a umov unless + // the destination type is i64 and the element type is i8 or i16. + case Instruction::ZExt: + if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) + return Cost; + } + + // If we are unable to perform the extend for free, get the default cost. + return Cost + getCastInstrCost(Opcode, Dst, Src); +} + int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { assert(Val->isVectorTy() && "This must be a vector type"); @@ -313,7 +368,7 @@ int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, } // All other insert/extracts cost this much. - return 3; + return ST->getVectorInsertExtractBaseCost(); } int AArch64TTIImpl::getArithmeticInstrCost( @@ -472,9 +527,7 @@ int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { } unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) { - if (ST->isCortexA57()) - return 4; - return 2; + return ST->getMaxInterleaveFactor(); } void AArch64TTIImpl::getUnrollingPreferences(Loop *L, @@ -571,3 +624,19 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, } return true; } + +unsigned AArch64TTIImpl::getCacheLineSize() { + return ST->getCacheLineSize(); +} + +unsigned AArch64TTIImpl::getPrefetchDistance() { + return ST->getPrefetchDistance(); +} + +unsigned AArch64TTIImpl::getMinPrefetchStride() { + return ST->getMinPrefetchStride(); +} + +unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() { + return ST->getMaxPrefetchIterationsAhead(); +} diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index ec58c4fe309f3..4f2e8310d769d 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -99,6 +99,9 @@ public: int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, + unsigned Index); + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); int getArithmeticInstrCost( @@ -127,6 +130,14 @@ public: int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, unsigned AddressSpace); + + unsigned getCacheLineSize(); + + unsigned getPrefetchDistance(); + + unsigned getMinPrefetchStride(); + + unsigned getMaxPrefetchIterationsAhead(); /// @} }; diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 394c8e78581f1..aebc370333e3b 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -13,7 +13,6 @@ #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" @@ -24,13 +23,14 @@ #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCRegisterInfo.h" #include 
"llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCTargetAsmParser.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetParser.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" #include <cstdio> @@ -70,6 +70,8 @@ private: bool Error(SMLoc L, const Twine &Msg) { return getParser().Error(L, Msg); } bool showMatchError(SMLoc Loc, unsigned ErrCode); + bool parseDirectiveArch(SMLoc L); + bool parseDirectiveCPU(SMLoc L); bool parseDirectiveWord(unsigned Size, SMLoc L); bool parseDirectiveInst(SMLoc L); @@ -866,14 +868,7 @@ public: if (!CE) return false; uint64_t Value = CE->getValue(); - if (RegWidth == 32) - Value &= 0xffffffffULL; - - // "lsl #0" takes precedence: in practice this only affects "#0, lsl #0". - if (Value == 0 && Shift != 0) - return false; - - return (Value & ~(0xffffULL << Shift)) == 0; + return AArch64_AM::isMOVZMovAlias(Value, Shift, RegWidth); } template<int RegWidth, int Shift> @@ -884,16 +879,7 @@ public: if (!CE) return false; uint64_t Value = CE->getValue(); - // MOVZ takes precedence over MOVN. - for (int MOVZShift = 0; MOVZShift <= 48; MOVZShift += 16) - if ((Value & ~(0xffffULL << MOVZShift)) == 0) - return false; - - Value = ~Value; - if (RegWidth == 32) - Value &= 0xffffffffULL; - - return (Value & ~(0xffffULL << Shift)) == 0; + return AArch64_AM::isMOVNMovAlias(Value, Shift, RegWidth); } bool isFPImm() const { return Kind == k_FPImm; } @@ -2087,12 +2073,9 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { return MatchOperand_ParseFail; } - bool Valid; - auto Mapper = AArch64PRFM::PRFMMapper(); - StringRef Name = - Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid); - Operands.push_back(AArch64Operand::CreatePrefetch(prfop, Name, - S, getContext())); + auto PRFM = AArch64PRFM::lookupPRFMByEncoding(MCE->getValue()); + Operands.push_back(AArch64Operand::CreatePrefetch( + prfop, PRFM ? PRFM->Name : "", S, getContext())); return MatchOperand_Success; } @@ -2101,18 +2084,15 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { return MatchOperand_ParseFail; } - bool Valid; - auto Mapper = AArch64PRFM::PRFMMapper(); - unsigned prfop = - Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); - if (!Valid) { + auto PRFM = AArch64PRFM::lookupPRFMByName(Tok.getString()); + if (!PRFM) { TokError("pre-fetch hint expected"); return MatchOperand_ParseFail; } Parser.Lex(); // Eat identifier token. - Operands.push_back(AArch64Operand::CreatePrefetch(prfop, Tok.getString(), - S, getContext())); + Operands.push_back(AArch64Operand::CreatePrefetch( + PRFM->Encoding, Tok.getString(), S, getContext())); return MatchOperand_Success; } @@ -2127,18 +2107,15 @@ AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) { return MatchOperand_ParseFail; } - bool Valid; - auto Mapper = AArch64PSBHint::PSBHintMapper(); - unsigned psbhint = - Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); - if (!Valid) { + auto PSB = AArch64PSBHint::lookupPSBByName(Tok.getString()); + if (!PSB) { TokError("invalid operand for instruction"); return MatchOperand_ParseFail; } Parser.Lex(); // Eat identifier token. 
- Operands.push_back(AArch64Operand::CreatePSBHint(psbhint, Tok.getString(), - S, getContext())); + Operands.push_back(AArch64Operand::CreatePSBHint( + PSB->Encoding, Tok.getString(), S, getContext())); return MatchOperand_Success; } @@ -2762,12 +2739,9 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { Error(ExprLoc, "barrier operand out of range"); return MatchOperand_ParseFail; } - bool Valid; - auto Mapper = AArch64DB::DBarrierMapper(); - StringRef Name = - Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid); - Operands.push_back( AArch64Operand::CreateBarrier(MCE->getValue(), Name, - ExprLoc, getContext())); + auto DB = AArch64DB::lookupDBByEncoding(MCE->getValue()); + Operands.push_back(AArch64Operand::CreateBarrier( + MCE->getValue(), DB ? DB->Name : "", ExprLoc, getContext())); return MatchOperand_Success; } @@ -2776,23 +2750,20 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { return MatchOperand_ParseFail; } - bool Valid; - auto Mapper = AArch64DB::DBarrierMapper(); - unsigned Opt = - Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); - if (!Valid) { + auto DB = AArch64DB::lookupDBByName(Tok.getString()); + if (!DB) { TokError("invalid barrier option name"); return MatchOperand_ParseFail; } // The only valid named option for ISB is 'sy' - if (Mnemonic == "isb" && Opt != AArch64DB::SY) { + if (Mnemonic == "isb" && DB->Encoding != AArch64DB::sy) { TokError("'sy' or #imm operand expected"); return MatchOperand_ParseFail; } - Operands.push_back( AArch64Operand::CreateBarrier(Opt, Tok.getString(), - getLoc(), getContext())); + Operands.push_back(AArch64Operand::CreateBarrier( + DB->Encoding, Tok.getString(), getLoc(), getContext())); Parser.Lex(); // Consume the option return MatchOperand_Success; @@ -2806,28 +2777,22 @@ AArch64AsmParser::tryParseSysReg(OperandVector &Operands) { if (Tok.isNot(AsmToken::Identifier)) return MatchOperand_NoMatch; - bool IsKnown; - auto MRSMapper = AArch64SysReg::MRSMapper(); - uint32_t MRSReg = MRSMapper.fromString(Tok.getString(), - getSTI().getFeatureBits(), IsKnown); - assert(IsKnown == (MRSReg != -1U) && - "register should be -1 if and only if it's unknown"); - - auto MSRMapper = AArch64SysReg::MSRMapper(); - uint32_t MSRReg = MSRMapper.fromString(Tok.getString(), - getSTI().getFeatureBits(), IsKnown); - assert(IsKnown == (MSRReg != -1U) && - "register should be -1 if and only if it's unknown"); - - auto PStateMapper = AArch64PState::PStateMapper(); - uint32_t PStateField = - PStateMapper.fromString(Tok.getString(), - getSTI().getFeatureBits(), IsKnown); - assert(IsKnown == (PStateField != -1U) && - "register should be -1 if and only if it's unknown"); - - Operands.push_back(AArch64Operand::CreateSysReg( - Tok.getString(), getLoc(), MRSReg, MSRReg, PStateField, getContext())); + int MRSReg, MSRReg; + auto SysReg = AArch64SysReg::lookupSysRegByName(Tok.getString()); + if (SysReg && SysReg->haveFeatures(getSTI().getFeatureBits())) { + MRSReg = SysReg->Readable ? SysReg->Encoding : -1; + MSRReg = SysReg->Writeable ? 
SysReg->Encoding : -1; + } else + MRSReg = MSRReg = AArch64SysReg::parseGenericRegister(Tok.getString()); + + auto PState = AArch64PState::lookupPStateByName(Tok.getString()); + unsigned PStateImm = -1; + if (PState && PState->haveFeatures(getSTI().getFeatureBits())) + PStateImm = PState->Encoding; + + Operands.push_back( + AArch64Operand::CreateSysReg(Tok.getString(), getLoc(), MRSReg, MSRReg, + PStateImm, getContext())); Parser.Lex(); // Eat identifier return MatchOperand_Success; @@ -4195,6 +4160,10 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getIdentifier(); SMLoc Loc = DirectiveID.getLoc(); + if (IDVal == ".arch") + return parseDirectiveArch(Loc); + if (IDVal == ".cpu") + return parseDirectiveCPU(Loc); if (IDVal == ".hword") return parseDirectiveWord(2, Loc); if (IDVal == ".word") @@ -4216,6 +4185,99 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { return parseDirectiveLOH(IDVal, Loc); } +static const struct { + const char *Name; + const FeatureBitset Features; +} ExtensionMap[] = { + { "crc", {AArch64::FeatureCRC} }, + { "crypto", {AArch64::FeatureCrypto} }, + { "fp", {AArch64::FeatureFPARMv8} }, + { "simd", {AArch64::FeatureNEON} }, + + // FIXME: Unsupported extensions + { "lse", {} }, + { "pan", {} }, + { "lor", {} }, + { "rdma", {} }, + { "profile", {} }, +}; + +/// parseDirectiveArch +/// ::= .arch token +bool AArch64AsmParser::parseDirectiveArch(SMLoc L) { + SMLoc ArchLoc = getLoc(); + + StringRef Arch, ExtensionString; + std::tie(Arch, ExtensionString) = + getParser().parseStringToEndOfStatement().trim().split('+'); + + unsigned ID = AArch64::parseArch(Arch); + if (ID == ARM::AK_INVALID) { + Error(ArchLoc, "unknown arch name"); + return false; + } + + MCSubtargetInfo &STI = copySTI(); + STI.setDefaultFeatures("", ""); + if (!ExtensionString.empty()) + STI.setDefaultFeatures("", ("+" + ExtensionString).str()); + setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + + return false; +} + +/// parseDirectiveCPU +/// ::= .cpu id +bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { + SMLoc CPULoc = getLoc(); + + StringRef CPU, ExtensionString; + std::tie(CPU, ExtensionString) = + getParser().parseStringToEndOfStatement().trim().split('+'); + + SmallVector<StringRef, 4> RequestedExtensions; + if (!ExtensionString.empty()) + ExtensionString.split(RequestedExtensions, '+'); + + // FIXME This is using tablegen data, but should be moved to ARMTargetParser + // once that is tablegen'ed + if (!getSTI().isCPUStringValid(CPU)) { + Error(CPULoc, "unknown CPU name"); + return false; + } + + MCSubtargetInfo &STI = copySTI(); + STI.setDefaultFeatures(CPU, ""); + + FeatureBitset Features = STI.getFeatureBits(); + for (auto Name : RequestedExtensions) { + bool EnableFeature = true; + + if (Name.startswith_lower("no")) { + EnableFeature = false; + Name = Name.substr(2); + } + + for (const auto &Extension : ExtensionMap) { + if (Extension.Name != Name) + continue; + + if (Extension.Features.none()) + report_fatal_error("unsupported architectural extension: " + Name); + + FeatureBitset ToggleFeatures = EnableFeature + ? 
(~Features & Extension.Features) + : ( Features & Extension.Features); + uint64_t Features = + ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures)); + setAvailableFeatures(Features); + + break; + } + } + return false; +} + /// parseDirectiveWord /// ::= .word [ expression (, expression)* ] bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) { diff --git a/lib/Target/AArch64/AsmParser/Makefile b/lib/Target/AArch64/AsmParser/Makefile deleted file mode 100644 index 00268c76f8e83..0000000000000 --- a/lib/Target/AArch64/AsmParser/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/AArch64/AsmParser/Makefile ---------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMAArch64AsmParser - -# Hack: we need to include 'main' ARM target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt index f26327ff84ad8..a79960ea96053 100644 --- a/lib/Target/AArch64/CMakeLists.txt +++ b/lib/Target/AArch64/CMakeLists.txt @@ -12,8 +12,25 @@ tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel) tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv) tablegen(LLVM AArch64GenSubtargetInfo.inc -gen-subtarget) tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler) +tablegen(LLVM AArch64GenSystemOperands.inc -gen-searchable-tables) + add_public_tablegen_target(AArch64CommonTableGen) +# List of all GlobalISel files. +set(GLOBAL_ISEL_FILES + AArch64CallLowering.cpp + AArch64RegisterBankInfo.cpp + ) + +# Add GlobalISel files to the dependencies if the user wants to build it. +if(LLVM_BUILD_GLOBAL_ISEL) + set(GLOBAL_ISEL_BUILD_FILES ${GLOBAL_ISEL_FILES}) +else() + set(GLOBAL_ISEL_BUILD_FILES"") + set(LLVM_OPTIONAL_SOURCES LLVMGlobalISel ${GLOBAL_ISEL_FILES}) +endif() + + add_llvm_target(AArch64CodeGen AArch64A57FPLoadBalancing.cpp AArch64AddressTypePromotion.cpp @@ -29,6 +46,7 @@ add_llvm_target(AArch64CodeGen AArch64A53Fix835769.cpp AArch64FrameLowering.cpp AArch64ConditionOptimizer.cpp + AArch64RedundantCopyElimination.cpp AArch64ISelDAGToDAG.cpp AArch64ISelLowering.cpp AArch64InstrInfo.cpp @@ -43,6 +61,7 @@ add_llvm_target(AArch64CodeGen AArch64TargetMachine.cpp AArch64TargetObjectFile.cpp AArch64TargetTransformInfo.cpp + ${GLOBAL_ISEL_BUILD_FILES} ) add_dependencies(LLVMAArch64CodeGen intrinsics_gen) diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index f1f968e73123e..fe6ea31b90613 100644 --- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -1523,13 +1523,12 @@ static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, Inst.addOperand(MCOperand::createImm(pstate_field)); Inst.addOperand(MCOperand::createImm(crm)); - bool ValidNamed; - const AArch64Disassembler *Dis = + const AArch64Disassembler *Dis = static_cast<const AArch64Disassembler *>(Decoder); - (void)AArch64PState::PStateMapper().toString(pstate_field, - Dis->getSubtargetInfo().getFeatureBits(), ValidNamed); - - return ValidNamed ? 
Success : Fail; + auto PState = AArch64PState::lookupPStateByEncoding(pstate_field); + if (PState && PState->haveFeatures(Dis->getSubtargetInfo().getFeatureBits())) + return Success; + return Fail; } static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn, @@ -1574,7 +1573,7 @@ static DecodeStatus DecodeWSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, const void *Decoder) { - return DecodeGPRSeqPairsClassRegisterClass(Inst, + return DecodeGPRSeqPairsClassRegisterClass(Inst, AArch64::WSeqPairsClassRegClassID, RegNo, Addr, Decoder); } @@ -1583,7 +1582,7 @@ static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, const void *Decoder) { - return DecodeGPRSeqPairsClassRegisterClass(Inst, + return DecodeGPRSeqPairsClassRegisterClass(Inst, AArch64::XSeqPairsClassRegClassID, RegNo, Addr, Decoder); } diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h index 7fb57adfeebaa..e475e505e7d12 100644 --- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h @@ -13,7 +13,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H #define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H -#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCDisassembler/MCDisassembler.h" namespace llvm { diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp index 82bc949927ce4..19d0ba2e1c415 100644 --- a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp +++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp @@ -134,9 +134,11 @@ bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand( if (ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr) CommentStream << "literal pool symbol address: " << ReferenceName; else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr) - CommentStream << "literal pool for: \"" << ReferenceName << "\""; - else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr) { + CommentStream << "literal pool for: \""; + CommentStream.write_escaped(ReferenceName); + CommentStream << "\""; + } else if (ReferenceType == LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref) CommentStream << "Objc cfstring ref: @\"" << ReferenceName << "\""; else if (ReferenceType == diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h index 12b8450b13c66..49e8449637971 100644 --- a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h +++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h @@ -14,7 +14,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H #define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H -#include "llvm/MC/MCExternalSymbolizer.h" +#include "llvm/MC/MCDisassembler/MCExternalSymbolizer.h" namespace llvm { diff --git a/lib/Target/AArch64/Disassembler/Makefile b/lib/Target/AArch64/Disassembler/Makefile deleted file mode 100644 index 741bb817a6334..0000000000000 --- a/lib/Target/AArch64/Disassembler/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/AArch64/Disassembler/Makefile ------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. 
See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../../.. -LIBRARYNAME = LLVMAArch64Disassembler - -# Hack: we need to include 'main' arm target directory to grab private headers -CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp index d8a8108243705..b4f85204714f1 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -219,6 +219,54 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, return; } + // MOVZ, MOVN and "ORR wzr, #imm" instructions are aliases for MOV, but their + // domains overlap so they need to be prioritized. The chain is "MOVZ lsl #0 > + // MOVZ lsl #N > MOVN lsl #0 > MOVN lsl #N > ORR". The highest instruction + // that can represent the move is the MOV alias, and the rest get printed + // normally. + if ((Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi) && + MI->getOperand(1).isImm() && MI->getOperand(2).isImm()) { + int RegWidth = Opcode == AArch64::MOVZXi ? 64 : 32; + int Shift = MI->getOperand(2).getImm(); + uint64_t Value = (uint64_t)MI->getOperand(1).getImm() << Shift; + + if (AArch64_AM::isMOVZMovAlias(Value, Shift, + Opcode == AArch64::MOVZXi ? 64 : 32)) { + O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #" + << formatImm(SignExtend64(Value, RegWidth)); + return; + } + } + + if ((Opcode == AArch64::MOVNXi || Opcode == AArch64::MOVNWi) && + MI->getOperand(1).isImm() && MI->getOperand(2).isImm()) { + int RegWidth = Opcode == AArch64::MOVNXi ? 64 : 32; + int Shift = MI->getOperand(2).getImm(); + uint64_t Value = ~((uint64_t)MI->getOperand(1).getImm() << Shift); + if (RegWidth == 32) + Value = Value & 0xffffffff; + + if (AArch64_AM::isMOVNMovAlias(Value, Shift, RegWidth)) { + O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #" + << formatImm(SignExtend64(Value, RegWidth)); + return; + } + } + + if ((Opcode == AArch64::ORRXri || Opcode == AArch64::ORRWri) && + (MI->getOperand(1).getReg() == AArch64::XZR || + MI->getOperand(1).getReg() == AArch64::WZR) && + MI->getOperand(2).isImm()) { + int RegWidth = Opcode == AArch64::ORRXri ? 
64 : 32; + uint64_t Value = AArch64_AM::decodeLogicalImmediate( + MI->getOperand(2).getImm(), RegWidth); + if (!AArch64_AM::isAnyMOVWMovAlias(Value, RegWidth)) { + O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #" + << formatImm(SignExtend64(Value, RegWidth)); + return; + } + } + if (!printAliasInstr(MI, STI, O)) printInstruction(MI, STI, O); @@ -928,14 +976,21 @@ void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, unsigned Reg = Op.getReg(); O << getRegisterName(Reg); } else if (Op.isImm()) { - O << '#' << Op.getImm(); + printImm(MI, OpNo, STI, O); } else { assert(Op.isExpr() && "unknown operand kind in printOperand"); Op.getExpr()->print(O, &MAI); } } -void AArch64InstPrinter::printHexImm(const MCInst *MI, unsigned OpNo, +void AArch64InstPrinter::printImm(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + O << "#" << formatImm(Op.getImm()); +} + +void AArch64InstPrinter::printImmHex(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); @@ -981,12 +1036,12 @@ void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum, assert(Val == MO.getImm() && "Add/sub immediate out of range!"); unsigned Shift = AArch64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm()); - O << '#' << Val; + O << '#' << formatImm(Val); if (Shift != 0) printShifter(MI, OpNum + 1, STI, O); if (CommentStream) - *CommentStream << '=' << (Val << Shift) << '\n'; + *CommentStream << '=' << formatImm(Val << Shift) << '\n'; } else { assert(MO.isExpr() && "Unexpected operand type!"); MO.getExpr()->print(O, &MAI); @@ -1104,14 +1159,14 @@ template<int Scale> void AArch64InstPrinter::printImmScale(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { - O << '#' << Scale * MI->getOperand(OpNum).getImm(); + O << '#' << formatImm(Scale * MI->getOperand(OpNum).getImm()); } void AArch64InstPrinter::printUImm12Offset(const MCInst *MI, unsigned OpNum, unsigned Scale, raw_ostream &O) { const MCOperand MO = MI->getOperand(OpNum); if (MO.isImm()) { - O << "#" << (MO.getImm() * Scale); + O << "#" << formatImm(MO.getImm() * Scale); } else { assert(MO.isExpr() && "Unexpected operand type!"); MO.getExpr()->print(O, &MAI); @@ -1123,7 +1178,7 @@ void AArch64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum, const MCOperand MO1 = MI->getOperand(OpNum + 1); O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()); if (MO1.isImm()) { - O << ", #" << (MO1.getImm() * Scale); + O << ", #" << formatImm(MO1.getImm() * Scale); } else { assert(MO1.isExpr() && "Unexpected operand type!"); O << ", "; @@ -1136,26 +1191,22 @@ void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned prfop = MI->getOperand(OpNum).getImm(); - bool Valid; - StringRef Name = - AArch64PRFM::PRFMMapper().toString(prfop, STI.getFeatureBits(), Valid); - if (Valid) - O << Name; + auto PRFM = AArch64PRFM::lookupPRFMByEncoding(prfop); + if (PRFM) + O << PRFM->Name; else - O << '#' << prfop; + O << '#' << formatImm(prfop); } void AArch64InstPrinter::printPSBHintOp(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned psbhintop = MI->getOperand(OpNum).getImm(); - bool Valid; - StringRef Name = - AArch64PSBHint::PSBHintMapper().toString(psbhintop, STI.getFeatureBits(), Valid); - if (Valid) - O << Name; + auto PSB = 
AArch64PSBHint::lookupPSBByEncoding(psbhintop); + if (PSB) + O << PSB->Name; else - O << '#' << psbhintop; + O << '#' << formatImm(psbhintop); } void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, @@ -1310,7 +1361,7 @@ void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum, // If the label has already been resolved to an immediate offset (say, when // we're running the disassembler), just print the immediate. if (Op.isImm()) { - O << "#" << (Op.getImm() * 4); + O << "#" << formatImm(Op.getImm() * 4); return; } @@ -1335,7 +1386,7 @@ void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum, // If the label has already been resolved to an immediate offset (say, when // we're running the disassembler), just print the immediate. if (Op.isImm()) { - O << "#" << (Op.getImm() * (1 << 12)); + O << "#" << formatImm(Op.getImm() * (1 << 12)); return; } @@ -1349,15 +1400,15 @@ void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo, unsigned Val = MI->getOperand(OpNo).getImm(); unsigned Opcode = MI->getOpcode(); - bool Valid; StringRef Name; - if (Opcode == AArch64::ISB) - Name = AArch64ISB::ISBMapper().toString(Val, STI.getFeatureBits(), - Valid); - else - Name = AArch64DB::DBarrierMapper().toString(Val, STI.getFeatureBits(), - Valid); - if (Valid) + if (Opcode == AArch64::ISB) { + auto ISB = AArch64ISB::lookupISBByEncoding(Val); + Name = ISB ? ISB->Name : ""; + } else { + auto DB = AArch64DB::lookupDBByEncoding(Val); + Name = DB ? DB->Name : ""; + } + if (!Name.empty()) O << Name; else O << "#" << Val; @@ -1368,10 +1419,19 @@ void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo, raw_ostream &O) { unsigned Val = MI->getOperand(OpNo).getImm(); - auto Mapper = AArch64SysReg::MRSMapper(); - std::string Name = Mapper.toString(Val, STI.getFeatureBits()); + // Horrible hack for the one register that has identical encodings but + // different names in MSR and MRS. Because of this, one of MRS and MSR is + // going to get the wrong entry + if (Val == AArch64SysReg::DBGDTRRX_EL0) { + O << "DBGDTRRX_EL0"; + return; + } - O << StringRef(Name).upper(); + const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val); + if (Reg && Reg->Readable && Reg->haveFeatures(STI.getFeatureBits())) + O << Reg->Name; + else + O << AArch64SysReg::genericRegisterString(Val); } void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo, @@ -1379,10 +1439,19 @@ void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo, raw_ostream &O) { unsigned Val = MI->getOperand(OpNo).getImm(); - auto Mapper = AArch64SysReg::MSRMapper(); - std::string Name = Mapper.toString(Val, STI.getFeatureBits()); + // Horrible hack for the one register that has identical encodings but + // different names in MSR and MRS. 
Because of this, one of MRS and MSR is + // going to get the wrong entry + if (Val == AArch64SysReg::DBGDTRTX_EL0) { + O << "DBGDTRTX_EL0"; + return; + } - O << StringRef(Name).upper(); + const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val); + if (Reg && Reg->Writeable && Reg->haveFeatures(STI.getFeatureBits())) + O << Reg->Name; + else + O << AArch64SysReg::genericRegisterString(Val); } void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo, @@ -1390,13 +1459,11 @@ void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo, raw_ostream &O) { unsigned Val = MI->getOperand(OpNo).getImm(); - bool Valid; - StringRef Name = - AArch64PState::PStateMapper().toString(Val, STI.getFeatureBits(), Valid); - if (Valid) - O << Name.upper(); + auto PState = AArch64PState::lookupPStateByEncoding(Val); + if (PState && PState->haveFeatures(STI.getFeatureBits())) + O << PState->Name; else - O << "#" << Val; + O << "#" << formatImm(Val); } void AArch64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo, diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h index ea68d9848b427..65dca99ed04e7 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -49,7 +49,9 @@ protected: // Operand printers void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printHexImm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + void printImm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printImmHex(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm, raw_ostream &O); diff --git a/lib/Target/AArch64/InstPrinter/Makefile b/lib/Target/AArch64/InstPrinter/Makefile deleted file mode 100644 index b17e8d080119b..0000000000000 --- a/lib/Target/AArch64/InstPrinter/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/AArch64/AsmPrinter/Makefile --------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMAArch64AsmPrinter - -# Hack: we need to include 'main' arm target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/LLVMBuild.txt b/lib/Target/AArch64/LLVMBuild.txt index 642c18394a67e..0196c505ba3cc 100644 --- a/lib/Target/AArch64/LLVMBuild.txt +++ b/lib/Target/AArch64/LLVMBuild.txt @@ -31,5 +31,5 @@ has_jit = 1 type = Library name = AArch64CodeGen parent = AArch64 -required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target +required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target GlobalISel add_to_library_groups = AArch64 diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h index 648b1dfc8c5ef..3e5ef4df47060 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h @@ -753,6 +753,49 @@ static inline uint64_t decodeAdvSIMDModImmType12(uint8_t Imm) { return (EncVal << 32) | EncVal; } +inline static bool isAnyMOVZMovAlias(uint64_t Value, int RegWidth) { + for (int Shift = 0; Shift <= RegWidth - 16; Shift += 16) + if ((Value & ~(0xffffULL << Shift)) == 0) + return true; + + return false; +} + +inline static bool isMOVZMovAlias(uint64_t Value, int Shift, int RegWidth) { + if (RegWidth == 32) + Value &= 0xffffffffULL; + + // "lsl #0" takes precedence: in practice this only affects "#0, lsl #0". + if (Value == 0 && Shift != 0) + return false; + + return (Value & ~(0xffffULL << Shift)) == 0; +} + +inline static bool isMOVNMovAlias(uint64_t Value, int Shift, int RegWidth) { + // MOVZ takes precedence over MOVN. + if (isAnyMOVZMovAlias(Value, RegWidth)) + return false; + + Value = ~Value; + if (RegWidth == 32) + Value &= 0xffffffffULL; + + return isMOVZMovAlias(Value, Shift, RegWidth); +} + +inline static bool isAnyMOVWMovAlias(uint64_t Value, int RegWidth) { + if (isAnyMOVZMovAlias(Value, RegWidth)) + return true; + + // It's not a MOVZ, but it might be a MOVN. 
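The MOV-alias helpers being added to AArch64AddressingModes.h in this hunk give the assembler and the instruction printer one shared definition of when an immediate may be written as a plain mov, with MOVZ taking precedence over MOVN, which in turn beats the ORR form. The standalone sketch below restates the isAnyMOVZMovAlias check and the MOVN half of isAnyMOVWMovAlias and runs them on one worked value; it mirrors the arithmetic in the hunk but is not code from the patch.

#include <cassert>
#include <cstdint>

// Mirrors isAnyMOVZMovAlias and the MOVN half of isAnyMOVWMovAlias above.
static bool anyMOVZAlias(uint64_t Value, int RegWidth) {
  for (int Shift = 0; Shift <= RegWidth - 16; Shift += 16)
    if ((Value & ~(0xffffULL << Shift)) == 0)
      return true;
  return false;
}

static bool anyMOVNAlias(uint64_t Value, int RegWidth) {
  if (anyMOVZAlias(Value, RegWidth))  // MOVZ takes precedence over MOVN.
    return false;
  Value = ~Value;
  if (RegWidth == 32)
    Value &= 0xffffffffULL;
  return anyMOVZAlias(Value, RegWidth);
}

int main() {
  // 0xfffffffffffeffff is not a MOVZ alias, but its complement is
  // 0x10000 = 0x0001 << 16, so it can be written "mov x0, #-65537"
  // (the printed form of "movn x0, #1, lsl #16").
  assert(!anyMOVZAlias(0xfffffffffffeffffULL, 64));
  assert(anyMOVNAlias(0xfffffffffffeffffULL, 64));
  return 0;
}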
+ Value = ~Value; + if (RegWidth == 32) + Value &= 0xffffffffULL; + + return isAnyMOVZMovAlias(Value, RegWidth); +} + } // end namespace AArch64_AM } // end namespace llvm diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 7624c7240d688..27993246eb07c 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -12,6 +12,7 @@ #include "MCTargetDesc/AArch64FixupKinds.h" #include "llvm/ADT/Triple.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCFixupKindInfo.h" @@ -28,9 +29,12 @@ namespace { class AArch64AsmBackend : public MCAsmBackend { static const unsigned PCRelFlagVal = MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel; +public: + bool IsLittleEndian; public: - AArch64AsmBackend(const Target &T) : MCAsmBackend() {} + AArch64AsmBackend(const Target &T, bool IsLittleEndian) + : MCAsmBackend(), IsLittleEndian(IsLittleEndian) {} unsigned getNumFixupKinds() const override { return AArch64::NumTargetFixupKinds; @@ -74,12 +78,15 @@ public: bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override; - void relaxInstruction(const MCInst &Inst, MCInst &Res) const override; + void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + MCInst &Res) const override; bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; void HandleAssemblerFlag(MCAssemblerFlag Flag) {} unsigned getPointerSize() const { return 8; } + + unsigned getFixupKindContainereSizeInBytes(unsigned Kind) const; }; } // end anonymous namespace @@ -129,14 +136,16 @@ static unsigned AdrImmBits(unsigned Value) { return (hi19 << 5) | (lo2 << 29); } -static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { +static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, + MCContext *Ctx) { + unsigned Kind = Fixup.getKind(); int64_t SignedValue = static_cast<int64_t>(Value); switch (Kind) { default: llvm_unreachable("Unknown fixup kind!"); case AArch64::fixup_aarch64_pcrel_adr_imm21: - if (SignedValue > 2097151 || SignedValue < -2097152) - report_fatal_error("fixup value out of range"); + if (Ctx && (SignedValue > 2097151 || SignedValue < -2097152)) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); return AdrImmBits(Value & 0x1fffffULL); case AArch64::fixup_aarch64_pcrel_adrp_imm21: return AdrImmBits((Value & 0x1fffff000ULL) >> 12); @@ -144,54 +153,66 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { case AArch64::fixup_aarch64_pcrel_branch19: // Signed 21-bit immediate if (SignedValue > 2097151 || SignedValue < -2097152) - report_fatal_error("fixup value out of range"); + if (Ctx) Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); + if (Ctx && (Value & 0x3)) + Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned"); // Low two bits are not encoded. 
return (Value >> 2) & 0x7ffff; case AArch64::fixup_aarch64_add_imm12: case AArch64::fixup_aarch64_ldst_imm12_scale1: // Unsigned 12-bit immediate - if (Value >= 0x1000) - report_fatal_error("invalid imm12 fixup value"); + if (Ctx && Value >= 0x1000) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); return Value; case AArch64::fixup_aarch64_ldst_imm12_scale2: // Unsigned 12-bit immediate which gets multiplied by 2 - if (Value & 1 || Value >= 0x2000) - report_fatal_error("invalid imm12 fixup value"); + if (Ctx && (Value >= 0x2000)) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); + if (Ctx && (Value & 0x1)) + Ctx->reportError(Fixup.getLoc(), "fixup must be 2-byte aligned"); return Value >> 1; case AArch64::fixup_aarch64_ldst_imm12_scale4: // Unsigned 12-bit immediate which gets multiplied by 4 - if (Value & 3 || Value >= 0x4000) - report_fatal_error("invalid imm12 fixup value"); + if (Ctx && (Value >= 0x4000)) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); + if (Ctx && (Value & 0x3)) + Ctx->reportError(Fixup.getLoc(), "fixup must be 4-byte aligned"); return Value >> 2; case AArch64::fixup_aarch64_ldst_imm12_scale8: // Unsigned 12-bit immediate which gets multiplied by 8 - if (Value & 7 || Value >= 0x8000) - report_fatal_error("invalid imm12 fixup value"); + if (Ctx && (Value >= 0x8000)) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); + if (Ctx && (Value & 0x7)) + Ctx->reportError(Fixup.getLoc(), "fixup must be 8-byte aligned"); return Value >> 3; case AArch64::fixup_aarch64_ldst_imm12_scale16: // Unsigned 12-bit immediate which gets multiplied by 16 - if (Value & 15 || Value >= 0x10000) - report_fatal_error("invalid imm12 fixup value"); + if (Ctx && (Value >= 0x10000)) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); + if (Ctx && (Value & 0xf)) + Ctx->reportError(Fixup.getLoc(), "fixup must be 16-byte aligned"); return Value >> 4; case AArch64::fixup_aarch64_movw: - report_fatal_error("no resolvable MOVZ/MOVK fixups supported yet"); + if (Ctx) + Ctx->reportError(Fixup.getLoc(), + "no resolvable MOVZ/MOVK fixups supported yet"); return Value; case AArch64::fixup_aarch64_pcrel_branch14: // Signed 16-bit immediate - if (SignedValue > 32767 || SignedValue < -32768) - report_fatal_error("fixup value out of range"); + if (Ctx && (SignedValue > 32767 || SignedValue < -32768)) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); // Low two bits are not encoded (4-byte alignment assumed). - if (Value & 0x3) - report_fatal_error("fixup not sufficiently aligned"); + if (Ctx && (Value & 0x3)) + Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned"); return (Value >> 2) & 0x3fff; case AArch64::fixup_aarch64_pcrel_branch26: case AArch64::fixup_aarch64_pcrel_call26: // Signed 28-bit immediate - if (SignedValue > 134217727 || SignedValue < -134217728) - report_fatal_error("fixup value out of range"); + if (Ctx && (SignedValue > 134217727 || SignedValue < -134217728)) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); // Low two bits are not encoded (4-byte alignment assumed). 
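(Concretely, for fixup_aarch64_pcrel_branch14 — used by the test-and-branch instructions — the accepted signed values lie in [-32768, 32767] bytes, the alignment check insists the low two bits are clear, and (Value >> 2) & 0x3fff keeps the 14 bits that are actually encoded, giving a reachable range of roughly ±32 KiB in 4-byte steps.)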
- if (Value & 0x3) - report_fatal_error("fixup not sufficiently aligned"); + if (Ctx && (Value & 0x3)) + Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned"); return (Value >> 2) & 0x3ffffff; case FK_Data_1: case FK_Data_2: @@ -201,6 +222,45 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { } } +/// getFixupKindContainereSizeInBytes - The number of bytes of the +/// container involved in big endian or 0 if the item is little endian +unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) const { + if (IsLittleEndian) + return 0; + + switch (Kind) { + default: + llvm_unreachable("Unknown fixup kind!"); + + case FK_Data_1: + return 1; + case FK_Data_2: + return 2; + case FK_Data_4: + return 4; + case FK_Data_8: + return 8; + + case AArch64::fixup_aarch64_tlsdesc_call: + case AArch64::fixup_aarch64_movw: + case AArch64::fixup_aarch64_pcrel_branch14: + case AArch64::fixup_aarch64_add_imm12: + case AArch64::fixup_aarch64_ldst_imm12_scale1: + case AArch64::fixup_aarch64_ldst_imm12_scale2: + case AArch64::fixup_aarch64_ldst_imm12_scale4: + case AArch64::fixup_aarch64_ldst_imm12_scale8: + case AArch64::fixup_aarch64_ldst_imm12_scale16: + case AArch64::fixup_aarch64_ldr_pcrel_imm19: + case AArch64::fixup_aarch64_pcrel_branch19: + case AArch64::fixup_aarch64_pcrel_adr_imm21: + case AArch64::fixup_aarch64_pcrel_adrp_imm21: + case AArch64::fixup_aarch64_pcrel_branch26: + case AArch64::fixup_aarch64_pcrel_call26: + // Instructions are always little endian + return 0; + } +} + void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, bool IsPCRel) const { @@ -209,7 +269,7 @@ void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, return; // Doesn't change encoding. MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); // Apply any target-specific value adjustments. - Value = adjustFixupValue(Fixup.getKind(), Value); + Value = adjustFixupValue(Fixup, Value, nullptr); // Shift the value into position. Value <<= Info.TargetOffset; @@ -217,10 +277,25 @@ void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned Offset = Fixup.getOffset(); assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + // Used to point to big endian bytes. + unsigned FulleSizeInBytes = getFixupKindContainereSizeInBytes(Fixup.getKind()); + // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. 
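The rewritten applyFixup below uses getFixupKindContainereSizeInBytes to decide byte order: data fixups report the width of their container, while every instruction fixup reports 0 because AArch64 instructions are stored little-endian regardless of the data endianness. Here is a standalone sketch of the byte placement the new loop performs, using an arbitrary 4-byte value purely for illustration:

#include <cstdint>
#include <cstdio>

// Sketch of the applyFixup byte placement for a 4-byte data fixup.
// ContainerSize == 0 means "store little-endian" (all instruction fixups);
// a non-zero size means the container holds big-endian data.
static void placeFixup(uint8_t *Data, unsigned NumBytes,
                       unsigned ContainerSize, uint64_t Value) {
  if (ContainerSize == 0) {
    for (unsigned i = 0; i != NumBytes; ++i)
      Data[i] |= uint8_t((Value >> (i * 8)) & 0xff);
  } else {
    for (unsigned i = 0; i != NumBytes; ++i)
      Data[ContainerSize - 1 - i] |= uint8_t((Value >> (i * 8)) & 0xff);
  }
}

int main() {
  uint8_t LE[4] = {}, BE[4] = {};
  placeFixup(LE, 4, /*ContainerSize=*/0, 0x12345678); // 78 56 34 12
  placeFixup(BE, 4, /*ContainerSize=*/4, 0x12345678); // 12 34 56 78
  for (int i = 0; i < 4; ++i) std::printf("%02x ", LE[i]);
  std::printf("| ");
  for (int i = 0; i < 4; ++i) std::printf("%02x ", BE[i]);
  std::printf("\n");
  return 0;
}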
- for (unsigned i = 0; i != NumBytes; ++i) - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); + if (FulleSizeInBytes == 0) { + // Handle as little-endian + for (unsigned i = 0; i != NumBytes; ++i) { + Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); + } + } else { + // Handle as big-endian + assert((Offset + FulleSizeInBytes) <= DataSize && "Invalid fixup size!"); + assert(NumBytes <= FulleSizeInBytes && "Invalid fixup size!"); + for (unsigned i = 0; i != NumBytes; ++i) { + unsigned Idx = FulleSizeInBytes - 1 - i; + Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff); + } + } } bool AArch64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const { @@ -239,6 +314,7 @@ bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, } void AArch64AsmBackend::relaxInstruction(const MCInst &Inst, + const MCSubtargetInfo &STI, MCInst &Res) const { llvm_unreachable("AArch64AsmBackend::relaxInstruction() unimplemented"); } @@ -264,14 +340,14 @@ namespace CU { enum CompactUnwindEncodings { /// \brief A "frameless" leaf function, where no non-volatile registers are /// saved. The return remains in LR throughout the function. - UNWIND_AArch64_MODE_FRAMELESS = 0x02000000, + UNWIND_ARM64_MODE_FRAMELESS = 0x02000000, /// \brief No compact unwind encoding available. Instead the low 23-bits of /// the compact unwind encoding is the offset of the DWARF FDE in the /// __eh_frame section. This mode is never used in object files. It is only /// generated by the linker in final linked images, which have only DWARF info /// for a function. - UNWIND_AArch64_MODE_DWARF = 0x03000000, + UNWIND_ARM64_MODE_DWARF = 0x03000000, /// \brief This is a standard arm64 prologue where FP/LR are immediately /// pushed on the stack, then SP is copied to FP. If there are any @@ -279,18 +355,18 @@ enum CompactUnwindEncodings { /// in a contiguous ranger right below the saved FP/LR pair. Any subset of the /// five X pairs and four D pairs can be saved, but the memory layout must be /// in register number order. - UNWIND_AArch64_MODE_FRAME = 0x04000000, + UNWIND_ARM64_MODE_FRAME = 0x04000000, /// \brief Frame register pair encodings. - UNWIND_AArch64_FRAME_X19_X20_PAIR = 0x00000001, - UNWIND_AArch64_FRAME_X21_X22_PAIR = 0x00000002, - UNWIND_AArch64_FRAME_X23_X24_PAIR = 0x00000004, - UNWIND_AArch64_FRAME_X25_X26_PAIR = 0x00000008, - UNWIND_AArch64_FRAME_X27_X28_PAIR = 0x00000010, - UNWIND_AArch64_FRAME_D8_D9_PAIR = 0x00000100, - UNWIND_AArch64_FRAME_D10_D11_PAIR = 0x00000200, - UNWIND_AArch64_FRAME_D12_D13_PAIR = 0x00000400, - UNWIND_AArch64_FRAME_D14_D15_PAIR = 0x00000800 + UNWIND_ARM64_FRAME_X19_X20_PAIR = 0x00000001, + UNWIND_ARM64_FRAME_X21_X22_PAIR = 0x00000002, + UNWIND_ARM64_FRAME_X23_X24_PAIR = 0x00000004, + UNWIND_ARM64_FRAME_X25_X26_PAIR = 0x00000008, + UNWIND_ARM64_FRAME_X27_X28_PAIR = 0x00000010, + UNWIND_ARM64_FRAME_D8_D9_PAIR = 0x00000100, + UNWIND_ARM64_FRAME_D10_D11_PAIR = 0x00000200, + UNWIND_ARM64_FRAME_D12_D13_PAIR = 0x00000400, + UNWIND_ARM64_FRAME_D14_D15_PAIR = 0x00000800 }; } // end CU namespace @@ -300,7 +376,7 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend { const MCRegisterInfo &MRI; /// \brief Encode compact unwind stack adjustment for frameless functions. - /// See UNWIND_AArch64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h. + /// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h. /// The stack size always needs to be 16 byte aligned. 
uint32_t encodeStackAdjustment(uint32_t StackSize) const { return (StackSize / 16) << 12; @@ -308,7 +384,7 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend { public: DarwinAArch64AsmBackend(const Target &T, const MCRegisterInfo &MRI) - : AArch64AsmBackend(T), MRI(MRI) {} + : AArch64AsmBackend(T, /*IsLittleEndian*/true), MRI(MRI) {} MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { return createAArch64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64, @@ -319,7 +395,7 @@ public: uint32_t generateCompactUnwindEncoding( ArrayRef<MCCFIInstruction> Instrs) const override { if (Instrs.empty()) - return CU::UNWIND_AArch64_MODE_FRAMELESS; + return CU::UNWIND_ARM64_MODE_FRAMELESS; bool HasFP = false; unsigned StackSize = 0; @@ -331,7 +407,7 @@ public: switch (Inst.getOperation()) { default: // Cannot handle this directive: bail out. - return CU::UNWIND_AArch64_MODE_DWARF; + return CU::UNWIND_ARM64_MODE_DWARF; case MCCFIInstruction::OpDefCfa: { // Defines a frame pointer. assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) == @@ -356,7 +432,7 @@ public: "Pushing invalid registers for frame!"); // Indicate that the function has a frame. - CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAME; + CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAME; HasFP = true; break; } @@ -370,11 +446,11 @@ public: // `.cfi_offset' instructions with the appropriate registers specified. unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true); if (i + 1 == e) - return CU::UNWIND_AArch64_MODE_DWARF; + return CU::UNWIND_ARM64_MODE_DWARF; const MCCFIInstruction &Inst2 = Instrs[++i]; if (Inst2.getOperation() != MCCFIInstruction::OpOffset) - return CU::UNWIND_AArch64_MODE_DWARF; + return CU::UNWIND_ARM64_MODE_DWARF; unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true); // N.B. 
The encodings must be in register number order, and the X @@ -390,19 +466,19 @@ public: if (Reg1 == AArch64::X19 && Reg2 == AArch64::X20 && (CompactUnwindEncoding & 0xF1E) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X19_X20_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X19_X20_PAIR; else if (Reg1 == AArch64::X21 && Reg2 == AArch64::X22 && (CompactUnwindEncoding & 0xF1C) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X21_X22_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X21_X22_PAIR; else if (Reg1 == AArch64::X23 && Reg2 == AArch64::X24 && (CompactUnwindEncoding & 0xF18) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X23_X24_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X23_X24_PAIR; else if (Reg1 == AArch64::X25 && Reg2 == AArch64::X26 && (CompactUnwindEncoding & 0xF10) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X25_X26_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X25_X26_PAIR; else if (Reg1 == AArch64::X27 && Reg2 == AArch64::X28 && (CompactUnwindEncoding & 0xF00) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X27_X28_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X27_X28_PAIR; else { Reg1 = getDRegFromBReg(Reg1); Reg2 = getDRegFromBReg(Reg2); @@ -413,18 +489,18 @@ public: // D14/D15 pair = 0x00000800 if (Reg1 == AArch64::D8 && Reg2 == AArch64::D9 && (CompactUnwindEncoding & 0xE00) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D8_D9_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D8_D9_PAIR; else if (Reg1 == AArch64::D10 && Reg2 == AArch64::D11 && (CompactUnwindEncoding & 0xC00) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D10_D11_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D10_D11_PAIR; else if (Reg1 == AArch64::D12 && Reg2 == AArch64::D13 && (CompactUnwindEncoding & 0x800) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D12_D13_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D12_D13_PAIR; else if (Reg1 == AArch64::D14 && Reg2 == AArch64::D15) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D14_D15_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D14_D15_PAIR; else // A pair was pushed which we cannot handle. - return CU::UNWIND_AArch64_MODE_DWARF; + return CU::UNWIND_ARM64_MODE_DWARF; } break; @@ -436,9 +512,9 @@ public: // With compact unwind info we can only represent stack adjustments of up // to 65520 bytes. 
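That 65520-byte ceiling follows from encodeStackAdjustment earlier in this file: the frameless encoding stores the stack size as a count of 16-byte units in a 12-bit field (the (StackSize / 16) << 12 computation), so the largest representable adjustment is 4095 * 16 = 65520 bytes, and anything bigger has to fall back to UNWIND_ARM64_MODE_DWARF.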
if (StackSize > 65520) - return CU::UNWIND_AArch64_MODE_DWARF; + return CU::UNWIND_ARM64_MODE_DWARF; - CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAMELESS; + CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAMELESS; CompactUnwindEncoding |= encodeStackAdjustment(StackSize); } @@ -453,10 +529,9 @@ namespace { class ELFAArch64AsmBackend : public AArch64AsmBackend { public: uint8_t OSABI; - bool IsLittleEndian; ELFAArch64AsmBackend(const Target &T, uint8_t OSABI, bool IsLittleEndian) - : AArch64AsmBackend(T), OSABI(OSABI), IsLittleEndian(IsLittleEndian) {} + : AArch64AsmBackend(T, IsLittleEndian), OSABI(OSABI) {} MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian); @@ -466,9 +541,6 @@ public: const MCFixup &Fixup, const MCFragment *DF, const MCValue &Target, uint64_t &Value, bool &IsResolved) override; - - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const override; }; void ELFAArch64AsmBackend::processFixupValue( @@ -489,34 +561,14 @@ void ELFAArch64AsmBackend::processFixupValue( // to the linker -- a relocation! if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21) IsResolved = false; -} - -// Returns whether this fixup is based on an address in the .eh_frame section, -// and therefore should be byte swapped. -// FIXME: Should be replaced with something more principled. -static bool isByteSwappedFixup(const MCExpr *E) { - MCValue Val; - if (!E->evaluateAsRelocatable(Val, nullptr, nullptr)) - return false; - if (!Val.getSymA() || Val.getSymA()->getSymbol().isUndefined()) - return false; - - const MCSectionELF *SecELF = - dyn_cast<MCSectionELF>(&Val.getSymA()->getSymbol().getSection()); - return SecELF->getSectionName() == ".eh_frame"; + // Try to get the encoded value for the fixup as-if we're mapping it into + // the instruction. This allows adjustFixupValue() to issue a diagnostic + // if the value is invalid. 
+ if (IsResolved) + (void)adjustFixupValue(Fixup, Value, &Asm.getContext()); } -void ELFAArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel) const { - // store fixups in .eh_frame section in big endian order - if (!IsLittleEndian && Fixup.getKind() == FK_Data_4) { - if (isByteSwappedFixup(Fixup.getValue())) - Value = ByteSwap_32(unsigned(Value)); - } - AArch64AsmBackend::applyFixup (Fixup, Data, DataSize, Value, IsPCRel); -} } MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index 1f516d1db8968..4b4c4097b97b4 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -15,6 +15,7 @@ #include "MCTargetDesc/AArch64FixupKinds.h" #include "MCTargetDesc/AArch64MCExpr.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/ErrorHandling.h" @@ -29,8 +30,8 @@ public: ~AArch64ELFObjectWriter() override; protected: - unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, - bool IsPCRel) const override; + unsigned getRelocType(MCContext &Ctx, const MCValue &Target, + const MCFixup &Fixup, bool IsPCRel) const override; private: }; @@ -43,9 +44,10 @@ AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, AArch64ELFObjectWriter::~AArch64ELFObjectWriter() {} -unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, - const MCFixup &Fixup, - bool IsPCRel) const { +unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, + const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { AArch64MCExpr::VariantKind RefKind = static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind()); AArch64MCExpr::VariantKind SymLoc = AArch64MCExpr::getSymbolLoc(RefKind); @@ -61,6 +63,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, if (IsPCRel) { switch ((unsigned)Fixup.getKind()) { + case FK_Data_1: + Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported"); + return ELF::R_AARCH64_NONE; case FK_Data_2: return ELF::R_AARCH64_PREL16; case FK_Data_4: @@ -79,7 +84,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21; if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC) return ELF::R_AARCH64_TLSDESC_ADR_PAGE21; - llvm_unreachable("invalid symbol kind for ADRP relocation"); + Ctx.reportError(Fixup.getLoc(), + "invalid symbol kind for ADRP relocation"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_pcrel_branch26: return ELF::R_AARCH64_JUMP26; case AArch64::fixup_aarch64_pcrel_call26: @@ -93,10 +100,14 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, case AArch64::fixup_aarch64_pcrel_branch19: return ELF::R_AARCH64_CONDBR19; default: - llvm_unreachable("Unsupported pc-relative fixup kind"); + Ctx.reportError(Fixup.getLoc(), "Unsupported pc-relative fixup kind"); + return ELF::R_AARCH64_NONE; } } else { switch ((unsigned)Fixup.getKind()) { + case FK_Data_1: + Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported"); + return ELF::R_AARCH64_NONE; case FK_Data_2: return ELF::R_AARCH64_ABS16; case FK_Data_4: @@ -121,8 +132,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, if (SymLoc == AArch64MCExpr::VK_ABS && 
IsNC) return ELF::R_AARCH64_ADD_ABS_LO12_NC; - report_fatal_error("invalid fixup for add (uimm12) instruction"); - return 0; + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for add (uimm12) instruction"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_ldst_imm12_scale1: if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return ELF::R_AARCH64_LDST8_ABS_LO12_NC; @@ -135,8 +147,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC; - report_fatal_error("invalid fixup for 8-bit load/store instruction"); - return 0; + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for 8-bit load/store instruction"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_ldst_imm12_scale2: if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return ELF::R_AARCH64_LDST16_ABS_LO12_NC; @@ -149,8 +162,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC; - report_fatal_error("invalid fixup for 16-bit load/store instruction"); - return 0; + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for 16-bit load/store instruction"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_ldst_imm12_scale4: if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return ELF::R_AARCH64_LDST32_ABS_LO12_NC; @@ -163,8 +177,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC; - report_fatal_error("invalid fixup for 32-bit load/store instruction"); - return 0; + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for 32-bit load/store instruction"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_ldst_imm12_scale8: if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return ELF::R_AARCH64_LDST64_ABS_LO12_NC; @@ -183,14 +198,16 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, if (SymLoc == AArch64MCExpr::VK_TLSDESC && IsNC) return ELF::R_AARCH64_TLSDESC_LD64_LO12_NC; - report_fatal_error("invalid fixup for 64-bit load/store instruction"); - return 0; + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for 64-bit load/store instruction"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_ldst_imm12_scale16: if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return ELF::R_AARCH64_LDST128_ABS_LO12_NC; - report_fatal_error("invalid fixup for 128-bit load/store instruction"); - return 0; + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for 128-bit load/store instruction"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_movw: if (RefKind == AArch64MCExpr::VK_ABS_G3) return ELF::R_AARCH64_MOVW_UABS_G3; @@ -236,12 +253,14 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1; if (RefKind == AArch64MCExpr::VK_GOTTPREL_G0_NC) return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC; - report_fatal_error("invalid fixup for movz/movk instruction"); - return 0; + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for movz/movk instruction"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_tlsdesc_call: return ELF::R_AARCH64_TLSDESC_CALL; default: - llvm_unreachable("Unknown ELF relocation type"); + Ctx.reportError(Fixup.getLoc(), "Unknown ELF relocation type"); + return ELF::R_AARCH64_NONE; } } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp 
b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index 7d8e79bc63c87..7b9ff8fa05031 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -154,24 +154,6 @@ public: SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - /// getSIMDShift64OpValue - Return the encoded value for the - // shift-by-immediate AdvSIMD instructions. - uint32_t getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; - - uint32_t getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; - - uint32_t getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; - - uint32_t getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; - unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue, const MCSubtargetInfo &STI) const; @@ -428,41 +410,6 @@ AArch64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx, llvm_unreachable("Invalid value for vector shift amount!"); } -uint32_t -AArch64MCCodeEmitter::getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 64 - (MO.getImm()); -} - -uint32_t AArch64MCCodeEmitter::getSIMDShift64_32OpValue( - const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 64 - (MO.getImm() | 32); -} - -uint32_t -AArch64MCCodeEmitter::getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 32 - (MO.getImm() | 16); -} - -uint32_t -AArch64MCCodeEmitter::getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 16 - (MO.getImm() | 8); -} - /// getFixedPointScaleOpValue - Return the encoded value for the // FP-to-fixed-point scale factor. 
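
[editor's note] The four getSIMDShift*OpValue encoders deleted just above reduced to fixed arithmetic on the shift immediate (64 - imm, 64 - (imm | 32), 32 - (imm | 16), 16 - (imm | 8)); this hunk only shows the deletion, and where the encoding is produced instead is presumably elsewhere in the patch. The standalone sketch below restates that arithmetic with a couple of worked values, nothing more.

    #include <cassert>
    #include <cstdint>

    // Arithmetic of the removed encoders, for reference only.
    static uint32_t encodeShift64(int64_t Imm)    { return 64 - Imm; }
    static uint32_t encodeShift64_32(int64_t Imm) { return 64 - (Imm | 32); }
    static uint32_t encodeShift32(int64_t Imm)    { return 32 - (Imm | 16); }
    static uint32_t encodeShift16(int64_t Imm)    { return 16 - (Imm | 8); }

    int main() {
      assert(encodeShift64(19) == 45);     // 64 - 19
      assert(encodeShift64_32(37) == 27);  // 64 - (37 | 32)
      assert(encodeShift32(5) == 11);      // 32 - (5 | 16)
      assert(encodeShift16(3) == 5);       // 16 - (3 | 8)
      return 0;
    }
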
uint32_t AArch64MCCodeEmitter::getFixedPointScaleOpValue( diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index 9f7bed0d3b125..7027806212084 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -15,7 +15,6 @@ #include "AArch64ELFStreamer.h" #include "AArch64MCAsmInfo.h" #include "InstPrinter/AArch64InstPrinter.h" -#include "llvm/MC/MCCodeGenInfo.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" @@ -72,10 +71,8 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, return MAI; } -static MCCodeGenInfo *createAArch64MCCodeGenInfo(const Triple &TT, - Reloc::Model RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) { +static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM, + CodeModel::Model &CM) { assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()) && "Only expect Darwin and ELF targets"); @@ -89,19 +86,6 @@ static MCCodeGenInfo *createAArch64MCCodeGenInfo(const Triple &TT, else if (CM != CodeModel::Small && CM != CodeModel::Large) report_fatal_error( "Only small and large code models are allowed on AArch64"); - - // AArch64 Darwin is always PIC. - if (TT.isOSDarwin()) - RM = Reloc::PIC_; - // On ELF platforms the default static relocation model has a smart enough - // linker to cope with referencing external symbols defined in a shared - // library. Hence DynamicNoPIC doesn't need to be promoted to PIC. - else if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC) - RM = Reloc::Static; - - MCCodeGenInfo *X = new MCCodeGenInfo(); - X->initMCCodeGenInfo(RM, CM, OL); - return X; } static MCInstPrinter *createAArch64MCInstPrinter(const Triple &T, @@ -140,7 +124,7 @@ extern "C" void LLVMInitializeAArch64TargetMC() { RegisterMCAsmInfoFn X(*T, createAArch64MCAsmInfo); // Register the MC codegen info. - TargetRegistry::RegisterMCCodeGenInfo(*T, createAArch64MCCodeGenInfo); + TargetRegistry::registerMCAdjustCodeGenOpts(*T, adjustCodeGenOpts); // Register the MC instruction info. TargetRegistry::RegisterMCInstrInfo(*T, createAArch64MCInstrInfo); diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h index 342384437c6a4..39414cc0c6a52 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -15,7 +15,6 @@ #define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H #include "llvm/Support/DataTypes.h" -#include <string> namespace llvm { class formatted_raw_ostream; diff --git a/lib/Target/AArch64/MCTargetDesc/Makefile b/lib/Target/AArch64/MCTargetDesc/Makefile deleted file mode 100644 index 5779ac5ac60a8..0000000000000 --- a/lib/Target/AArch64/MCTargetDesc/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/AArch64/TargetDesc/Makefile --------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../../.. -LIBRARYNAME = LLVMAArch64Desc - -# Hack: we need to include 'main' target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/Makefile b/lib/Target/AArch64/Makefile deleted file mode 100644 index f356c58504131..0000000000000 --- a/lib/Target/AArch64/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -##===- lib/Target/AArch64/Makefile -------------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../.. -LIBRARYNAME = LLVMAArch64CodeGen -TARGET = AArch64 - -# Make sure that tblgen is run, first thing. -BUILT_SOURCES = AArch64GenRegisterInfo.inc AArch64GenInstrInfo.inc \ - AArch64GenAsmWriter.inc AArch64GenAsmWriter1.inc \ - AArch64GenDAGISel.inc \ - AArch64GenCallingConv.inc AArch64GenAsmMatcher.inc \ - AArch64GenSubtargetInfo.inc AArch64GenMCCodeEmitter.inc \ - AArch64GenFastISel.inc AArch64GenDisassemblerTables.inc \ - AArch64GenMCPseudoLowering.inc - -DIRS = TargetInfo InstPrinter AsmParser Disassembler MCTargetDesc Utils - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/TargetInfo/Makefile b/lib/Target/AArch64/TargetInfo/Makefile deleted file mode 100644 index 9dc9aa4bccf7a..0000000000000 --- a/lib/Target/AArch64/TargetInfo/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/AArch64/TargetInfo/Makefile --------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMAArch64Info - -# Hack: we need to include 'main' target directory to grab private headers -CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index cde1c6df26084..e65ba1f2401d7 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -11,858 +11,84 @@ // //===----------------------------------------------------------------------===// #include "AArch64BaseInfo.h" -#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Regex.h" using namespace llvm; -StringRef AArch64NamedImmMapper::toString(uint32_t Value, - const FeatureBitset& FeatureBits, bool &Valid) const { - for (unsigned i = 0; i < NumMappings; ++i) { - if (Mappings[i].isValueEqual(Value, FeatureBits)) { - Valid = true; - return Mappings[i].Name; - } +namespace llvm { + namespace AArch64AT { +#define GET_AT_IMPL +#include "AArch64GenSystemOperands.inc" } - - Valid = false; - return StringRef(); } -uint32_t AArch64NamedImmMapper::fromString(StringRef Name, - const FeatureBitset& FeatureBits, bool &Valid) const { - std::string LowerCaseName = Name.lower(); - for (unsigned i = 0; i < NumMappings; ++i) { - if (Mappings[i].isNameEqual(LowerCaseName, FeatureBits)) { - Valid = true; - return Mappings[i].Value; - } - } - Valid = false; - return -1; +namespace llvm { + namespace AArch64DB { +#define GET_DB_IMPL +#include "AArch64GenSystemOperands.inc" + } } -bool AArch64NamedImmMapper::validImm(uint32_t Value) const { - return Value < TooBigImm; +namespace llvm { + namespace AArch64DC { +#define GET_DC_IMPL +#include "AArch64GenSystemOperands.inc" + } } -const AArch64NamedImmMapper::Mapping AArch64AT::ATMapper::ATMappings[] = { - {"s1e1r", S1E1R, {}}, - {"s1e2r", S1E2R, {}}, - {"s1e3r", S1E3R, {}}, - {"s1e1w", S1E1W, {}}, - {"s1e2w", S1E2W, {}}, - {"s1e3w", S1E3W, {}}, - {"s1e0r", S1E0R, {}}, - {"s1e0w", S1E0W, {}}, - {"s12e1r", S12E1R, {}}, - {"s12e1w", S12E1W, {}}, - {"s12e0r", S12E0R, {}}, - {"s12e0w", S12E0W, {}}, -}; - -AArch64AT::ATMapper::ATMapper() - : AArch64NamedImmMapper(ATMappings, 0) {} - -const AArch64NamedImmMapper::Mapping AArch64DB::DBarrierMapper::DBarrierMappings[] = { - {"oshld", OSHLD, {}}, - {"oshst", OSHST, {}}, - {"osh", OSH, {}}, - {"nshld", NSHLD, {}}, - {"nshst", NSHST, {}}, - {"nsh", NSH, {}}, - {"ishld", ISHLD, {}}, - {"ishst", ISHST, {}}, - {"ish", ISH, {}}, - {"ld", LD, {}}, - {"st", ST, {}}, - {"sy", SY, {}} -}; - -AArch64DB::DBarrierMapper::DBarrierMapper() - : AArch64NamedImmMapper(DBarrierMappings, 16u) {} - -const AArch64NamedImmMapper::Mapping AArch64DC::DCMapper::DCMappings[] = { - {"zva", ZVA, {}}, - {"ivac", IVAC, {}}, - {"isw", ISW, {}}, - {"cvac", CVAC, {}}, - {"csw", CSW, {}}, - {"cvau", CVAU, {}}, - {"civac", CIVAC, {}}, - {"cisw", CISW, {}} -}; - -AArch64DC::DCMapper::DCMapper() - : AArch64NamedImmMapper(DCMappings, 0) {} - -const AArch64NamedImmMapper::Mapping AArch64IC::ICMapper::ICMappings[] = { - {"ialluis", IALLUIS, {}}, - {"iallu", IALLU, {}}, - {"ivau", IVAU, {}} -}; - -AArch64IC::ICMapper::ICMapper() - : AArch64NamedImmMapper(ICMappings, 0) {} - -const AArch64NamedImmMapper::Mapping AArch64ISB::ISBMapper::ISBMappings[] = { - {"sy", SY, {}}, -}; - -AArch64ISB::ISBMapper::ISBMapper() - : AArch64NamedImmMapper(ISBMappings, 16) {} - -const AArch64NamedImmMapper::Mapping AArch64PRFM::PRFMMapper::PRFMMappings[] = { - {"pldl1keep", PLDL1KEEP, {}}, - {"pldl1strm", PLDL1STRM, {}}, - {"pldl2keep", PLDL2KEEP, {}}, - {"pldl2strm", PLDL2STRM, {}}, - 
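
[editor's note] From here on, the hand-maintained name/encoding tables in AArch64BaseInfo.cpp are replaced by TableGen output pulled in via the GET_*_IMPL guards around AArch64GenSystemOperands.inc, and the feature-gated entries move to record types (seen later in this patch) that carry a FeatureBitset plus a haveFeatures() subset check instead of per-mapping feature sets. The sketch below shows how such generated tables are typically consumed at a call site; the lookupDBByEncoding name follows the SearchableTable emitter's usual lookup<Table>By<Field> convention and is an assumption, while the record shape (Name, Encoding) matches the declarations added to AArch64BaseInfo.h later in this patch, and 0xb -> "ish" matches the old DBarrier table being deleted here.

    #include "Utils/AArch64BaseInfo.h"   // path as seen from lib/Target/AArch64
    #include "llvm/ADT/StringRef.h"

    using namespace llvm;

    // Hypothetical use of the generated barrier table: map an encoding back to
    // its mnemonic, falling back to printing the raw immediate when unknown.
    static bool printBarrier(uint16_t Enc, StringRef &NameOut) {
      if (const AArch64DB::DB *Entry = AArch64DB::lookupDBByEncoding(Enc)) {
        NameOut = Entry->Name;   // e.g. 0xb -> "ish"
        return true;
      }
      return false;
    }
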
{"pldl3keep", PLDL3KEEP, {}}, - {"pldl3strm", PLDL3STRM, {}}, - {"plil1keep", PLIL1KEEP, {}}, - {"plil1strm", PLIL1STRM, {}}, - {"plil2keep", PLIL2KEEP, {}}, - {"plil2strm", PLIL2STRM, {}}, - {"plil3keep", PLIL3KEEP, {}}, - {"plil3strm", PLIL3STRM, {}}, - {"pstl1keep", PSTL1KEEP, {}}, - {"pstl1strm", PSTL1STRM, {}}, - {"pstl2keep", PSTL2KEEP, {}}, - {"pstl2strm", PSTL2STRM, {}}, - {"pstl3keep", PSTL3KEEP, {}}, - {"pstl3strm", PSTL3STRM, {}} -}; - -AArch64PRFM::PRFMMapper::PRFMMapper() - : AArch64NamedImmMapper(PRFMMappings, 32) {} - -const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStateMappings[] = { - {"spsel", SPSel, {}}, - {"daifset", DAIFSet, {}}, - {"daifclr", DAIFClr, {}}, - - // v8.1a "Privileged Access Never" extension-specific PStates - {"pan", PAN, {AArch64::HasV8_1aOps}}, - - // v8.2a - {"uao", UAO, {AArch64::HasV8_2aOps}}, -}; - -AArch64PState::PStateMapper::PStateMapper() - : AArch64NamedImmMapper(PStateMappings, 0) {} - -const AArch64NamedImmMapper::Mapping AArch64PSBHint::PSBHintMapper::PSBHintMappings[] = { - // v8.2a "Statistical Profiling" extension-specific PSB operand - {"csync", CSync, {AArch64::FeatureSPE}}, -}; - -AArch64PSBHint::PSBHintMapper::PSBHintMapper() - : AArch64NamedImmMapper(PSBHintMappings, 0) {} - -const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = { - {"mdccsr_el0", MDCCSR_EL0, {}}, - {"dbgdtrrx_el0", DBGDTRRX_EL0, {}}, - {"mdrar_el1", MDRAR_EL1, {}}, - {"oslsr_el1", OSLSR_EL1, {}}, - {"dbgauthstatus_el1", DBGAUTHSTATUS_EL1, {}}, - {"pmceid0_el0", PMCEID0_EL0, {}}, - {"pmceid1_el0", PMCEID1_EL0, {}}, - {"midr_el1", MIDR_EL1, {}}, - {"ccsidr_el1", CCSIDR_EL1, {}}, - {"clidr_el1", CLIDR_EL1, {}}, - {"ctr_el0", CTR_EL0, {}}, - {"mpidr_el1", MPIDR_EL1, {}}, - {"revidr_el1", REVIDR_EL1, {}}, - {"aidr_el1", AIDR_EL1, {}}, - {"dczid_el0", DCZID_EL0, {}}, - {"id_pfr0_el1", ID_PFR0_EL1, {}}, - {"id_pfr1_el1", ID_PFR1_EL1, {}}, - {"id_dfr0_el1", ID_DFR0_EL1, {}}, - {"id_afr0_el1", ID_AFR0_EL1, {}}, - {"id_mmfr0_el1", ID_MMFR0_EL1, {}}, - {"id_mmfr1_el1", ID_MMFR1_EL1, {}}, - {"id_mmfr2_el1", ID_MMFR2_EL1, {}}, - {"id_mmfr3_el1", ID_MMFR3_EL1, {}}, - {"id_mmfr4_el1", ID_MMFR4_EL1, {}}, - {"id_isar0_el1", ID_ISAR0_EL1, {}}, - {"id_isar1_el1", ID_ISAR1_EL1, {}}, - {"id_isar2_el1", ID_ISAR2_EL1, {}}, - {"id_isar3_el1", ID_ISAR3_EL1, {}}, - {"id_isar4_el1", ID_ISAR4_EL1, {}}, - {"id_isar5_el1", ID_ISAR5_EL1, {}}, - {"id_aa64pfr0_el1", ID_A64PFR0_EL1, {}}, - {"id_aa64pfr1_el1", ID_A64PFR1_EL1, {}}, - {"id_aa64dfr0_el1", ID_A64DFR0_EL1, {}}, - {"id_aa64dfr1_el1", ID_A64DFR1_EL1, {}}, - {"id_aa64afr0_el1", ID_A64AFR0_EL1, {}}, - {"id_aa64afr1_el1", ID_A64AFR1_EL1, {}}, - {"id_aa64isar0_el1", ID_A64ISAR0_EL1, {}}, - {"id_aa64isar1_el1", ID_A64ISAR1_EL1, {}}, - {"id_aa64mmfr0_el1", ID_A64MMFR0_EL1, {}}, - {"id_aa64mmfr1_el1", ID_A64MMFR1_EL1, {}}, - {"id_aa64mmfr2_el1", ID_A64MMFR2_EL1, {AArch64::HasV8_2aOps}}, - {"mvfr0_el1", MVFR0_EL1, {}}, - {"mvfr1_el1", MVFR1_EL1, {}}, - {"mvfr2_el1", MVFR2_EL1, {}}, - {"rvbar_el1", RVBAR_EL1, {}}, - {"rvbar_el2", RVBAR_EL2, {}}, - {"rvbar_el3", RVBAR_EL3, {}}, - {"isr_el1", ISR_EL1, {}}, - {"cntpct_el0", CNTPCT_EL0, {}}, - {"cntvct_el0", CNTVCT_EL0, {}}, - - // Trace registers - {"trcstatr", TRCSTATR, {}}, - {"trcidr8", TRCIDR8, {}}, - {"trcidr9", TRCIDR9, {}}, - {"trcidr10", TRCIDR10, {}}, - {"trcidr11", TRCIDR11, {}}, - {"trcidr12", TRCIDR12, {}}, - {"trcidr13", TRCIDR13, {}}, - {"trcidr0", TRCIDR0, {}}, - {"trcidr1", TRCIDR1, {}}, - {"trcidr2", TRCIDR2, {}}, - {"trcidr3", TRCIDR3, 
{}}, - {"trcidr4", TRCIDR4, {}}, - {"trcidr5", TRCIDR5, {}}, - {"trcidr6", TRCIDR6, {}}, - {"trcidr7", TRCIDR7, {}}, - {"trcoslsr", TRCOSLSR, {}}, - {"trcpdsr", TRCPDSR, {}}, - {"trcdevaff0", TRCDEVAFF0, {}}, - {"trcdevaff1", TRCDEVAFF1, {}}, - {"trclsr", TRCLSR, {}}, - {"trcauthstatus", TRCAUTHSTATUS, {}}, - {"trcdevarch", TRCDEVARCH, {}}, - {"trcdevid", TRCDEVID, {}}, - {"trcdevtype", TRCDEVTYPE, {}}, - {"trcpidr4", TRCPIDR4, {}}, - {"trcpidr5", TRCPIDR5, {}}, - {"trcpidr6", TRCPIDR6, {}}, - {"trcpidr7", TRCPIDR7, {}}, - {"trcpidr0", TRCPIDR0, {}}, - {"trcpidr1", TRCPIDR1, {}}, - {"trcpidr2", TRCPIDR2, {}}, - {"trcpidr3", TRCPIDR3, {}}, - {"trccidr0", TRCCIDR0, {}}, - {"trccidr1", TRCCIDR1, {}}, - {"trccidr2", TRCCIDR2, {}}, - {"trccidr3", TRCCIDR3, {}}, - - // GICv3 registers - {"icc_iar1_el1", ICC_IAR1_EL1, {}}, - {"icc_iar0_el1", ICC_IAR0_EL1, {}}, - {"icc_hppir1_el1", ICC_HPPIR1_EL1, {}}, - {"icc_hppir0_el1", ICC_HPPIR0_EL1, {}}, - {"icc_rpr_el1", ICC_RPR_EL1, {}}, - {"ich_vtr_el2", ICH_VTR_EL2, {}}, - {"ich_eisr_el2", ICH_EISR_EL2, {}}, - {"ich_elsr_el2", ICH_ELSR_EL2, {}}, - - // v8.1a "Limited Ordering Regions" extension-specific system registers - {"lorid_el1", LORID_EL1, {AArch64::HasV8_1aOps}}, -}; - -AArch64SysReg::MRSMapper::MRSMapper() { - InstMappings = &MRSMappings[0]; - NumInstMappings = llvm::array_lengthof(MRSMappings); +namespace llvm { + namespace AArch64IC { +#define GET_IC_IMPL +#include "AArch64GenSystemOperands.inc" + } } -const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRMappings[] = { - {"dbgdtrtx_el0", DBGDTRTX_EL0, {}}, - {"oslar_el1", OSLAR_EL1, {}}, - {"pmswinc_el0", PMSWINC_EL0, {}}, - - // Trace registers - {"trcoslar", TRCOSLAR, {}}, - {"trclar", TRCLAR, {}}, - - // GICv3 registers - {"icc_eoir1_el1", ICC_EOIR1_EL1, {}}, - {"icc_eoir0_el1", ICC_EOIR0_EL1, {}}, - {"icc_dir_el1", ICC_DIR_EL1, {}}, - {"icc_sgi1r_el1", ICC_SGI1R_EL1, {}}, - {"icc_asgi1r_el1", ICC_ASGI1R_EL1, {}}, - {"icc_sgi0r_el1", ICC_SGI0R_EL1, {}}, -}; - -AArch64SysReg::MSRMapper::MSRMapper() { - InstMappings = &MSRMappings[0]; - NumInstMappings = llvm::array_lengthof(MSRMappings); +namespace llvm { + namespace AArch64ISB { +#define GET_ISB_IMPL +#include "AArch64GenSystemOperands.inc" + } +} +namespace llvm { + namespace AArch64PRFM { +#define GET_PRFM_IMPL +#include "AArch64GenSystemOperands.inc" + } } +namespace llvm { + namespace AArch64PState { +#define GET_PSTATE_IMPL +#include "AArch64GenSystemOperands.inc" + } +} -const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegMappings[] = { - {"osdtrrx_el1", OSDTRRX_EL1, {}}, - {"osdtrtx_el1", OSDTRTX_EL1, {}}, - {"teecr32_el1", TEECR32_EL1, {}}, - {"mdccint_el1", MDCCINT_EL1, {}}, - {"mdscr_el1", MDSCR_EL1, {}}, - {"dbgdtr_el0", DBGDTR_EL0, {}}, - {"oseccr_el1", OSECCR_EL1, {}}, - {"dbgvcr32_el2", DBGVCR32_EL2, {}}, - {"dbgbvr0_el1", DBGBVR0_EL1, {}}, - {"dbgbvr1_el1", DBGBVR1_EL1, {}}, - {"dbgbvr2_el1", DBGBVR2_EL1, {}}, - {"dbgbvr3_el1", DBGBVR3_EL1, {}}, - {"dbgbvr4_el1", DBGBVR4_EL1, {}}, - {"dbgbvr5_el1", DBGBVR5_EL1, {}}, - {"dbgbvr6_el1", DBGBVR6_EL1, {}}, - {"dbgbvr7_el1", DBGBVR7_EL1, {}}, - {"dbgbvr8_el1", DBGBVR8_EL1, {}}, - {"dbgbvr9_el1", DBGBVR9_EL1, {}}, - {"dbgbvr10_el1", DBGBVR10_EL1, {}}, - {"dbgbvr11_el1", DBGBVR11_EL1, {}}, - {"dbgbvr12_el1", DBGBVR12_EL1, {}}, - {"dbgbvr13_el1", DBGBVR13_EL1, {}}, - {"dbgbvr14_el1", DBGBVR14_EL1, {}}, - {"dbgbvr15_el1", DBGBVR15_EL1, {}}, - {"dbgbcr0_el1", DBGBCR0_EL1, {}}, - {"dbgbcr1_el1", DBGBCR1_EL1, {}}, - {"dbgbcr2_el1", DBGBCR2_EL1, {}}, - 
{"dbgbcr3_el1", DBGBCR3_EL1, {}}, - {"dbgbcr4_el1", DBGBCR4_EL1, {}}, - {"dbgbcr5_el1", DBGBCR5_EL1, {}}, - {"dbgbcr6_el1", DBGBCR6_EL1, {}}, - {"dbgbcr7_el1", DBGBCR7_EL1, {}}, - {"dbgbcr8_el1", DBGBCR8_EL1, {}}, - {"dbgbcr9_el1", DBGBCR9_EL1, {}}, - {"dbgbcr10_el1", DBGBCR10_EL1, {}}, - {"dbgbcr11_el1", DBGBCR11_EL1, {}}, - {"dbgbcr12_el1", DBGBCR12_EL1, {}}, - {"dbgbcr13_el1", DBGBCR13_EL1, {}}, - {"dbgbcr14_el1", DBGBCR14_EL1, {}}, - {"dbgbcr15_el1", DBGBCR15_EL1, {}}, - {"dbgwvr0_el1", DBGWVR0_EL1, {}}, - {"dbgwvr1_el1", DBGWVR1_EL1, {}}, - {"dbgwvr2_el1", DBGWVR2_EL1, {}}, - {"dbgwvr3_el1", DBGWVR3_EL1, {}}, - {"dbgwvr4_el1", DBGWVR4_EL1, {}}, - {"dbgwvr5_el1", DBGWVR5_EL1, {}}, - {"dbgwvr6_el1", DBGWVR6_EL1, {}}, - {"dbgwvr7_el1", DBGWVR7_EL1, {}}, - {"dbgwvr8_el1", DBGWVR8_EL1, {}}, - {"dbgwvr9_el1", DBGWVR9_EL1, {}}, - {"dbgwvr10_el1", DBGWVR10_EL1, {}}, - {"dbgwvr11_el1", DBGWVR11_EL1, {}}, - {"dbgwvr12_el1", DBGWVR12_EL1, {}}, - {"dbgwvr13_el1", DBGWVR13_EL1, {}}, - {"dbgwvr14_el1", DBGWVR14_EL1, {}}, - {"dbgwvr15_el1", DBGWVR15_EL1, {}}, - {"dbgwcr0_el1", DBGWCR0_EL1, {}}, - {"dbgwcr1_el1", DBGWCR1_EL1, {}}, - {"dbgwcr2_el1", DBGWCR2_EL1, {}}, - {"dbgwcr3_el1", DBGWCR3_EL1, {}}, - {"dbgwcr4_el1", DBGWCR4_EL1, {}}, - {"dbgwcr5_el1", DBGWCR5_EL1, {}}, - {"dbgwcr6_el1", DBGWCR6_EL1, {}}, - {"dbgwcr7_el1", DBGWCR7_EL1, {}}, - {"dbgwcr8_el1", DBGWCR8_EL1, {}}, - {"dbgwcr9_el1", DBGWCR9_EL1, {}}, - {"dbgwcr10_el1", DBGWCR10_EL1, {}}, - {"dbgwcr11_el1", DBGWCR11_EL1, {}}, - {"dbgwcr12_el1", DBGWCR12_EL1, {}}, - {"dbgwcr13_el1", DBGWCR13_EL1, {}}, - {"dbgwcr14_el1", DBGWCR14_EL1, {}}, - {"dbgwcr15_el1", DBGWCR15_EL1, {}}, - {"teehbr32_el1", TEEHBR32_EL1, {}}, - {"osdlr_el1", OSDLR_EL1, {}}, - {"dbgprcr_el1", DBGPRCR_EL1, {}}, - {"dbgclaimset_el1", DBGCLAIMSET_EL1, {}}, - {"dbgclaimclr_el1", DBGCLAIMCLR_EL1, {}}, - {"csselr_el1", CSSELR_EL1, {}}, - {"vpidr_el2", VPIDR_EL2, {}}, - {"vmpidr_el2", VMPIDR_EL2, {}}, - {"sctlr_el1", SCTLR_EL1, {}}, - {"sctlr_el2", SCTLR_EL2, {}}, - {"sctlr_el3", SCTLR_EL3, {}}, - {"actlr_el1", ACTLR_EL1, {}}, - {"actlr_el2", ACTLR_EL2, {}}, - {"actlr_el3", ACTLR_EL3, {}}, - {"cpacr_el1", CPACR_EL1, {}}, - {"hcr_el2", HCR_EL2, {}}, - {"scr_el3", SCR_EL3, {}}, - {"mdcr_el2", MDCR_EL2, {}}, - {"sder32_el3", SDER32_EL3, {}}, - {"cptr_el2", CPTR_EL2, {}}, - {"cptr_el3", CPTR_EL3, {}}, - {"hstr_el2", HSTR_EL2, {}}, - {"hacr_el2", HACR_EL2, {}}, - {"mdcr_el3", MDCR_EL3, {}}, - {"ttbr0_el1", TTBR0_EL1, {}}, - {"ttbr0_el2", TTBR0_EL2, {}}, - {"ttbr0_el3", TTBR0_EL3, {}}, - {"ttbr1_el1", TTBR1_EL1, {}}, - {"tcr_el1", TCR_EL1, {}}, - {"tcr_el2", TCR_EL2, {}}, - {"tcr_el3", TCR_EL3, {}}, - {"vttbr_el2", VTTBR_EL2, {}}, - {"vtcr_el2", VTCR_EL2, {}}, - {"dacr32_el2", DACR32_EL2, {}}, - {"spsr_el1", SPSR_EL1, {}}, - {"spsr_el2", SPSR_EL2, {}}, - {"spsr_el3", SPSR_EL3, {}}, - {"elr_el1", ELR_EL1, {}}, - {"elr_el2", ELR_EL2, {}}, - {"elr_el3", ELR_EL3, {}}, - {"sp_el0", SP_EL0, {}}, - {"sp_el1", SP_EL1, {}}, - {"sp_el2", SP_EL2, {}}, - {"spsel", SPSel, {}}, - {"nzcv", NZCV, {}}, - {"daif", DAIF, {}}, - {"currentel", CurrentEL, {}}, - {"spsr_irq", SPSR_irq, {}}, - {"spsr_abt", SPSR_abt, {}}, - {"spsr_und", SPSR_und, {}}, - {"spsr_fiq", SPSR_fiq, {}}, - {"fpcr", FPCR, {}}, - {"fpsr", FPSR, {}}, - {"dspsr_el0", DSPSR_EL0, {}}, - {"dlr_el0", DLR_EL0, {}}, - {"ifsr32_el2", IFSR32_EL2, {}}, - {"afsr0_el1", AFSR0_EL1, {}}, - {"afsr0_el2", AFSR0_EL2, {}}, - {"afsr0_el3", AFSR0_EL3, {}}, - {"afsr1_el1", AFSR1_EL1, {}}, - {"afsr1_el2", AFSR1_EL2, {}}, - {"afsr1_el3", AFSR1_EL3, {}}, - 
{"esr_el1", ESR_EL1, {}}, - {"esr_el2", ESR_EL2, {}}, - {"esr_el3", ESR_EL3, {}}, - {"fpexc32_el2", FPEXC32_EL2, {}}, - {"far_el1", FAR_EL1, {}}, - {"far_el2", FAR_EL2, {}}, - {"far_el3", FAR_EL3, {}}, - {"hpfar_el2", HPFAR_EL2, {}}, - {"par_el1", PAR_EL1, {}}, - {"pmcr_el0", PMCR_EL0, {}}, - {"pmcntenset_el0", PMCNTENSET_EL0, {}}, - {"pmcntenclr_el0", PMCNTENCLR_EL0, {}}, - {"pmovsclr_el0", PMOVSCLR_EL0, {}}, - {"pmselr_el0", PMSELR_EL0, {}}, - {"pmccntr_el0", PMCCNTR_EL0, {}}, - {"pmxevtyper_el0", PMXEVTYPER_EL0, {}}, - {"pmxevcntr_el0", PMXEVCNTR_EL0, {}}, - {"pmuserenr_el0", PMUSERENR_EL0, {}}, - {"pmintenset_el1", PMINTENSET_EL1, {}}, - {"pmintenclr_el1", PMINTENCLR_EL1, {}}, - {"pmovsset_el0", PMOVSSET_EL0, {}}, - {"mair_el1", MAIR_EL1, {}}, - {"mair_el2", MAIR_EL2, {}}, - {"mair_el3", MAIR_EL3, {}}, - {"amair_el1", AMAIR_EL1, {}}, - {"amair_el2", AMAIR_EL2, {}}, - {"amair_el3", AMAIR_EL3, {}}, - {"vbar_el1", VBAR_EL1, {}}, - {"vbar_el2", VBAR_EL2, {}}, - {"vbar_el3", VBAR_EL3, {}}, - {"rmr_el1", RMR_EL1, {}}, - {"rmr_el2", RMR_EL2, {}}, - {"rmr_el3", RMR_EL3, {}}, - {"contextidr_el1", CONTEXTIDR_EL1, {}}, - {"tpidr_el0", TPIDR_EL0, {}}, - {"tpidr_el2", TPIDR_EL2, {}}, - {"tpidr_el3", TPIDR_EL3, {}}, - {"tpidrro_el0", TPIDRRO_EL0, {}}, - {"tpidr_el1", TPIDR_EL1, {}}, - {"cntfrq_el0", CNTFRQ_EL0, {}}, - {"cntvoff_el2", CNTVOFF_EL2, {}}, - {"cntkctl_el1", CNTKCTL_EL1, {}}, - {"cnthctl_el2", CNTHCTL_EL2, {}}, - {"cntp_tval_el0", CNTP_TVAL_EL0, {}}, - {"cnthp_tval_el2", CNTHP_TVAL_EL2, {}}, - {"cntps_tval_el1", CNTPS_TVAL_EL1, {}}, - {"cntp_ctl_el0", CNTP_CTL_EL0, {}}, - {"cnthp_ctl_el2", CNTHP_CTL_EL2, {}}, - {"cntps_ctl_el1", CNTPS_CTL_EL1, {}}, - {"cntp_cval_el0", CNTP_CVAL_EL0, {}}, - {"cnthp_cval_el2", CNTHP_CVAL_EL2, {}}, - {"cntps_cval_el1", CNTPS_CVAL_EL1, {}}, - {"cntv_tval_el0", CNTV_TVAL_EL0, {}}, - {"cntv_ctl_el0", CNTV_CTL_EL0, {}}, - {"cntv_cval_el0", CNTV_CVAL_EL0, {}}, - {"pmevcntr0_el0", PMEVCNTR0_EL0, {}}, - {"pmevcntr1_el0", PMEVCNTR1_EL0, {}}, - {"pmevcntr2_el0", PMEVCNTR2_EL0, {}}, - {"pmevcntr3_el0", PMEVCNTR3_EL0, {}}, - {"pmevcntr4_el0", PMEVCNTR4_EL0, {}}, - {"pmevcntr5_el0", PMEVCNTR5_EL0, {}}, - {"pmevcntr6_el0", PMEVCNTR6_EL0, {}}, - {"pmevcntr7_el0", PMEVCNTR7_EL0, {}}, - {"pmevcntr8_el0", PMEVCNTR8_EL0, {}}, - {"pmevcntr9_el0", PMEVCNTR9_EL0, {}}, - {"pmevcntr10_el0", PMEVCNTR10_EL0, {}}, - {"pmevcntr11_el0", PMEVCNTR11_EL0, {}}, - {"pmevcntr12_el0", PMEVCNTR12_EL0, {}}, - {"pmevcntr13_el0", PMEVCNTR13_EL0, {}}, - {"pmevcntr14_el0", PMEVCNTR14_EL0, {}}, - {"pmevcntr15_el0", PMEVCNTR15_EL0, {}}, - {"pmevcntr16_el0", PMEVCNTR16_EL0, {}}, - {"pmevcntr17_el0", PMEVCNTR17_EL0, {}}, - {"pmevcntr18_el0", PMEVCNTR18_EL0, {}}, - {"pmevcntr19_el0", PMEVCNTR19_EL0, {}}, - {"pmevcntr20_el0", PMEVCNTR20_EL0, {}}, - {"pmevcntr21_el0", PMEVCNTR21_EL0, {}}, - {"pmevcntr22_el0", PMEVCNTR22_EL0, {}}, - {"pmevcntr23_el0", PMEVCNTR23_EL0, {}}, - {"pmevcntr24_el0", PMEVCNTR24_EL0, {}}, - {"pmevcntr25_el0", PMEVCNTR25_EL0, {}}, - {"pmevcntr26_el0", PMEVCNTR26_EL0, {}}, - {"pmevcntr27_el0", PMEVCNTR27_EL0, {}}, - {"pmevcntr28_el0", PMEVCNTR28_EL0, {}}, - {"pmevcntr29_el0", PMEVCNTR29_EL0, {}}, - {"pmevcntr30_el0", PMEVCNTR30_EL0, {}}, - {"pmccfiltr_el0", PMCCFILTR_EL0, {}}, - {"pmevtyper0_el0", PMEVTYPER0_EL0, {}}, - {"pmevtyper1_el0", PMEVTYPER1_EL0, {}}, - {"pmevtyper2_el0", PMEVTYPER2_EL0, {}}, - {"pmevtyper3_el0", PMEVTYPER3_EL0, {}}, - {"pmevtyper4_el0", PMEVTYPER4_EL0, {}}, - {"pmevtyper5_el0", PMEVTYPER5_EL0, {}}, - {"pmevtyper6_el0", PMEVTYPER6_EL0, {}}, - 
{"pmevtyper7_el0", PMEVTYPER7_EL0, {}}, - {"pmevtyper8_el0", PMEVTYPER8_EL0, {}}, - {"pmevtyper9_el0", PMEVTYPER9_EL0, {}}, - {"pmevtyper10_el0", PMEVTYPER10_EL0, {}}, - {"pmevtyper11_el0", PMEVTYPER11_EL0, {}}, - {"pmevtyper12_el0", PMEVTYPER12_EL0, {}}, - {"pmevtyper13_el0", PMEVTYPER13_EL0, {}}, - {"pmevtyper14_el0", PMEVTYPER14_EL0, {}}, - {"pmevtyper15_el0", PMEVTYPER15_EL0, {}}, - {"pmevtyper16_el0", PMEVTYPER16_EL0, {}}, - {"pmevtyper17_el0", PMEVTYPER17_EL0, {}}, - {"pmevtyper18_el0", PMEVTYPER18_EL0, {}}, - {"pmevtyper19_el0", PMEVTYPER19_EL0, {}}, - {"pmevtyper20_el0", PMEVTYPER20_EL0, {}}, - {"pmevtyper21_el0", PMEVTYPER21_EL0, {}}, - {"pmevtyper22_el0", PMEVTYPER22_EL0, {}}, - {"pmevtyper23_el0", PMEVTYPER23_EL0, {}}, - {"pmevtyper24_el0", PMEVTYPER24_EL0, {}}, - {"pmevtyper25_el0", PMEVTYPER25_EL0, {}}, - {"pmevtyper26_el0", PMEVTYPER26_EL0, {}}, - {"pmevtyper27_el0", PMEVTYPER27_EL0, {}}, - {"pmevtyper28_el0", PMEVTYPER28_EL0, {}}, - {"pmevtyper29_el0", PMEVTYPER29_EL0, {}}, - {"pmevtyper30_el0", PMEVTYPER30_EL0, {}}, - - // Trace registers - {"trcprgctlr", TRCPRGCTLR, {}}, - {"trcprocselr", TRCPROCSELR, {}}, - {"trcconfigr", TRCCONFIGR, {}}, - {"trcauxctlr", TRCAUXCTLR, {}}, - {"trceventctl0r", TRCEVENTCTL0R, {}}, - {"trceventctl1r", TRCEVENTCTL1R, {}}, - {"trcstallctlr", TRCSTALLCTLR, {}}, - {"trctsctlr", TRCTSCTLR, {}}, - {"trcsyncpr", TRCSYNCPR, {}}, - {"trcccctlr", TRCCCCTLR, {}}, - {"trcbbctlr", TRCBBCTLR, {}}, - {"trctraceidr", TRCTRACEIDR, {}}, - {"trcqctlr", TRCQCTLR, {}}, - {"trcvictlr", TRCVICTLR, {}}, - {"trcviiectlr", TRCVIIECTLR, {}}, - {"trcvissctlr", TRCVISSCTLR, {}}, - {"trcvipcssctlr", TRCVIPCSSCTLR, {}}, - {"trcvdctlr", TRCVDCTLR, {}}, - {"trcvdsacctlr", TRCVDSACCTLR, {}}, - {"trcvdarcctlr", TRCVDARCCTLR, {}}, - {"trcseqevr0", TRCSEQEVR0, {}}, - {"trcseqevr1", TRCSEQEVR1, {}}, - {"trcseqevr2", TRCSEQEVR2, {}}, - {"trcseqrstevr", TRCSEQRSTEVR, {}}, - {"trcseqstr", TRCSEQSTR, {}}, - {"trcextinselr", TRCEXTINSELR, {}}, - {"trccntrldvr0", TRCCNTRLDVR0, {}}, - {"trccntrldvr1", TRCCNTRLDVR1, {}}, - {"trccntrldvr2", TRCCNTRLDVR2, {}}, - {"trccntrldvr3", TRCCNTRLDVR3, {}}, - {"trccntctlr0", TRCCNTCTLR0, {}}, - {"trccntctlr1", TRCCNTCTLR1, {}}, - {"trccntctlr2", TRCCNTCTLR2, {}}, - {"trccntctlr3", TRCCNTCTLR3, {}}, - {"trccntvr0", TRCCNTVR0, {}}, - {"trccntvr1", TRCCNTVR1, {}}, - {"trccntvr2", TRCCNTVR2, {}}, - {"trccntvr3", TRCCNTVR3, {}}, - {"trcimspec0", TRCIMSPEC0, {}}, - {"trcimspec1", TRCIMSPEC1, {}}, - {"trcimspec2", TRCIMSPEC2, {}}, - {"trcimspec3", TRCIMSPEC3, {}}, - {"trcimspec4", TRCIMSPEC4, {}}, - {"trcimspec5", TRCIMSPEC5, {}}, - {"trcimspec6", TRCIMSPEC6, {}}, - {"trcimspec7", TRCIMSPEC7, {}}, - {"trcrsctlr2", TRCRSCTLR2, {}}, - {"trcrsctlr3", TRCRSCTLR3, {}}, - {"trcrsctlr4", TRCRSCTLR4, {}}, - {"trcrsctlr5", TRCRSCTLR5, {}}, - {"trcrsctlr6", TRCRSCTLR6, {}}, - {"trcrsctlr7", TRCRSCTLR7, {}}, - {"trcrsctlr8", TRCRSCTLR8, {}}, - {"trcrsctlr9", TRCRSCTLR9, {}}, - {"trcrsctlr10", TRCRSCTLR10, {}}, - {"trcrsctlr11", TRCRSCTLR11, {}}, - {"trcrsctlr12", TRCRSCTLR12, {}}, - {"trcrsctlr13", TRCRSCTLR13, {}}, - {"trcrsctlr14", TRCRSCTLR14, {}}, - {"trcrsctlr15", TRCRSCTLR15, {}}, - {"trcrsctlr16", TRCRSCTLR16, {}}, - {"trcrsctlr17", TRCRSCTLR17, {}}, - {"trcrsctlr18", TRCRSCTLR18, {}}, - {"trcrsctlr19", TRCRSCTLR19, {}}, - {"trcrsctlr20", TRCRSCTLR20, {}}, - {"trcrsctlr21", TRCRSCTLR21, {}}, - {"trcrsctlr22", TRCRSCTLR22, {}}, - {"trcrsctlr23", TRCRSCTLR23, {}}, - {"trcrsctlr24", TRCRSCTLR24, {}}, - {"trcrsctlr25", TRCRSCTLR25, {}}, - {"trcrsctlr26", 
TRCRSCTLR26, {}}, - {"trcrsctlr27", TRCRSCTLR27, {}}, - {"trcrsctlr28", TRCRSCTLR28, {}}, - {"trcrsctlr29", TRCRSCTLR29, {}}, - {"trcrsctlr30", TRCRSCTLR30, {}}, - {"trcrsctlr31", TRCRSCTLR31, {}}, - {"trcssccr0", TRCSSCCR0, {}}, - {"trcssccr1", TRCSSCCR1, {}}, - {"trcssccr2", TRCSSCCR2, {}}, - {"trcssccr3", TRCSSCCR3, {}}, - {"trcssccr4", TRCSSCCR4, {}}, - {"trcssccr5", TRCSSCCR5, {}}, - {"trcssccr6", TRCSSCCR6, {}}, - {"trcssccr7", TRCSSCCR7, {}}, - {"trcsscsr0", TRCSSCSR0, {}}, - {"trcsscsr1", TRCSSCSR1, {}}, - {"trcsscsr2", TRCSSCSR2, {}}, - {"trcsscsr3", TRCSSCSR3, {}}, - {"trcsscsr4", TRCSSCSR4, {}}, - {"trcsscsr5", TRCSSCSR5, {}}, - {"trcsscsr6", TRCSSCSR6, {}}, - {"trcsscsr7", TRCSSCSR7, {}}, - {"trcsspcicr0", TRCSSPCICR0, {}}, - {"trcsspcicr1", TRCSSPCICR1, {}}, - {"trcsspcicr2", TRCSSPCICR2, {}}, - {"trcsspcicr3", TRCSSPCICR3, {}}, - {"trcsspcicr4", TRCSSPCICR4, {}}, - {"trcsspcicr5", TRCSSPCICR5, {}}, - {"trcsspcicr6", TRCSSPCICR6, {}}, - {"trcsspcicr7", TRCSSPCICR7, {}}, - {"trcpdcr", TRCPDCR, {}}, - {"trcacvr0", TRCACVR0, {}}, - {"trcacvr1", TRCACVR1, {}}, - {"trcacvr2", TRCACVR2, {}}, - {"trcacvr3", TRCACVR3, {}}, - {"trcacvr4", TRCACVR4, {}}, - {"trcacvr5", TRCACVR5, {}}, - {"trcacvr6", TRCACVR6, {}}, - {"trcacvr7", TRCACVR7, {}}, - {"trcacvr8", TRCACVR8, {}}, - {"trcacvr9", TRCACVR9, {}}, - {"trcacvr10", TRCACVR10, {}}, - {"trcacvr11", TRCACVR11, {}}, - {"trcacvr12", TRCACVR12, {}}, - {"trcacvr13", TRCACVR13, {}}, - {"trcacvr14", TRCACVR14, {}}, - {"trcacvr15", TRCACVR15, {}}, - {"trcacatr0", TRCACATR0, {}}, - {"trcacatr1", TRCACATR1, {}}, - {"trcacatr2", TRCACATR2, {}}, - {"trcacatr3", TRCACATR3, {}}, - {"trcacatr4", TRCACATR4, {}}, - {"trcacatr5", TRCACATR5, {}}, - {"trcacatr6", TRCACATR6, {}}, - {"trcacatr7", TRCACATR7, {}}, - {"trcacatr8", TRCACATR8, {}}, - {"trcacatr9", TRCACATR9, {}}, - {"trcacatr10", TRCACATR10, {}}, - {"trcacatr11", TRCACATR11, {}}, - {"trcacatr12", TRCACATR12, {}}, - {"trcacatr13", TRCACATR13, {}}, - {"trcacatr14", TRCACATR14, {}}, - {"trcacatr15", TRCACATR15, {}}, - {"trcdvcvr0", TRCDVCVR0, {}}, - {"trcdvcvr1", TRCDVCVR1, {}}, - {"trcdvcvr2", TRCDVCVR2, {}}, - {"trcdvcvr3", TRCDVCVR3, {}}, - {"trcdvcvr4", TRCDVCVR4, {}}, - {"trcdvcvr5", TRCDVCVR5, {}}, - {"trcdvcvr6", TRCDVCVR6, {}}, - {"trcdvcvr7", TRCDVCVR7, {}}, - {"trcdvcmr0", TRCDVCMR0, {}}, - {"trcdvcmr1", TRCDVCMR1, {}}, - {"trcdvcmr2", TRCDVCMR2, {}}, - {"trcdvcmr3", TRCDVCMR3, {}}, - {"trcdvcmr4", TRCDVCMR4, {}}, - {"trcdvcmr5", TRCDVCMR5, {}}, - {"trcdvcmr6", TRCDVCMR6, {}}, - {"trcdvcmr7", TRCDVCMR7, {}}, - {"trccidcvr0", TRCCIDCVR0, {}}, - {"trccidcvr1", TRCCIDCVR1, {}}, - {"trccidcvr2", TRCCIDCVR2, {}}, - {"trccidcvr3", TRCCIDCVR3, {}}, - {"trccidcvr4", TRCCIDCVR4, {}}, - {"trccidcvr5", TRCCIDCVR5, {}}, - {"trccidcvr6", TRCCIDCVR6, {}}, - {"trccidcvr7", TRCCIDCVR7, {}}, - {"trcvmidcvr0", TRCVMIDCVR0, {}}, - {"trcvmidcvr1", TRCVMIDCVR1, {}}, - {"trcvmidcvr2", TRCVMIDCVR2, {}}, - {"trcvmidcvr3", TRCVMIDCVR3, {}}, - {"trcvmidcvr4", TRCVMIDCVR4, {}}, - {"trcvmidcvr5", TRCVMIDCVR5, {}}, - {"trcvmidcvr6", TRCVMIDCVR6, {}}, - {"trcvmidcvr7", TRCVMIDCVR7, {}}, - {"trccidcctlr0", TRCCIDCCTLR0, {}}, - {"trccidcctlr1", TRCCIDCCTLR1, {}}, - {"trcvmidcctlr0", TRCVMIDCCTLR0, {}}, - {"trcvmidcctlr1", TRCVMIDCCTLR1, {}}, - {"trcitctrl", TRCITCTRL, {}}, - {"trcclaimset", TRCCLAIMSET, {}}, - {"trcclaimclr", TRCCLAIMCLR, {}}, - - // GICv3 registers - {"icc_bpr1_el1", ICC_BPR1_EL1, {}}, - {"icc_bpr0_el1", ICC_BPR0_EL1, {}}, - {"icc_pmr_el1", ICC_PMR_EL1, {}}, - {"icc_ctlr_el1", ICC_CTLR_EL1, {}}, - 
{"icc_ctlr_el3", ICC_CTLR_EL3, {}}, - {"icc_sre_el1", ICC_SRE_EL1, {}}, - {"icc_sre_el2", ICC_SRE_EL2, {}}, - {"icc_sre_el3", ICC_SRE_EL3, {}}, - {"icc_igrpen0_el1", ICC_IGRPEN0_EL1, {}}, - {"icc_igrpen1_el1", ICC_IGRPEN1_EL1, {}}, - {"icc_igrpen1_el3", ICC_IGRPEN1_EL3, {}}, - {"icc_seien_el1", ICC_SEIEN_EL1, {}}, - {"icc_ap0r0_el1", ICC_AP0R0_EL1, {}}, - {"icc_ap0r1_el1", ICC_AP0R1_EL1, {}}, - {"icc_ap0r2_el1", ICC_AP0R2_EL1, {}}, - {"icc_ap0r3_el1", ICC_AP0R3_EL1, {}}, - {"icc_ap1r0_el1", ICC_AP1R0_EL1, {}}, - {"icc_ap1r1_el1", ICC_AP1R1_EL1, {}}, - {"icc_ap1r2_el1", ICC_AP1R2_EL1, {}}, - {"icc_ap1r3_el1", ICC_AP1R3_EL1, {}}, - {"ich_ap0r0_el2", ICH_AP0R0_EL2, {}}, - {"ich_ap0r1_el2", ICH_AP0R1_EL2, {}}, - {"ich_ap0r2_el2", ICH_AP0R2_EL2, {}}, - {"ich_ap0r3_el2", ICH_AP0R3_EL2, {}}, - {"ich_ap1r0_el2", ICH_AP1R0_EL2, {}}, - {"ich_ap1r1_el2", ICH_AP1R1_EL2, {}}, - {"ich_ap1r2_el2", ICH_AP1R2_EL2, {}}, - {"ich_ap1r3_el2", ICH_AP1R3_EL2, {}}, - {"ich_hcr_el2", ICH_HCR_EL2, {}}, - {"ich_misr_el2", ICH_MISR_EL2, {}}, - {"ich_vmcr_el2", ICH_VMCR_EL2, {}}, - {"ich_vseir_el2", ICH_VSEIR_EL2, {}}, - {"ich_lr0_el2", ICH_LR0_EL2, {}}, - {"ich_lr1_el2", ICH_LR1_EL2, {}}, - {"ich_lr2_el2", ICH_LR2_EL2, {}}, - {"ich_lr3_el2", ICH_LR3_EL2, {}}, - {"ich_lr4_el2", ICH_LR4_EL2, {}}, - {"ich_lr5_el2", ICH_LR5_EL2, {}}, - {"ich_lr6_el2", ICH_LR6_EL2, {}}, - {"ich_lr7_el2", ICH_LR7_EL2, {}}, - {"ich_lr8_el2", ICH_LR8_EL2, {}}, - {"ich_lr9_el2", ICH_LR9_EL2, {}}, - {"ich_lr10_el2", ICH_LR10_EL2, {}}, - {"ich_lr11_el2", ICH_LR11_EL2, {}}, - {"ich_lr12_el2", ICH_LR12_EL2, {}}, - {"ich_lr13_el2", ICH_LR13_EL2, {}}, - {"ich_lr14_el2", ICH_LR14_EL2, {}}, - {"ich_lr15_el2", ICH_LR15_EL2, {}}, - - // Cyclone registers - {"cpm_ioacc_ctl_el3", CPM_IOACC_CTL_EL3, {AArch64::ProcCyclone}}, - - // v8.1a "Privileged Access Never" extension-specific system registers - {"pan", PAN, {AArch64::HasV8_1aOps}}, - - // v8.1a "Limited Ordering Regions" extension-specific system registers - {"lorsa_el1", LORSA_EL1, {AArch64::HasV8_1aOps}}, - {"lorea_el1", LOREA_EL1, {AArch64::HasV8_1aOps}}, - {"lorn_el1", LORN_EL1, {AArch64::HasV8_1aOps}}, - {"lorc_el1", LORC_EL1, {AArch64::HasV8_1aOps}}, - - // v8.1a "Virtualization host extensions" system registers - {"ttbr1_el2", TTBR1_EL2, {AArch64::HasV8_1aOps}}, - {"contextidr_el2", CONTEXTIDR_EL2, {AArch64::HasV8_1aOps}}, - {"cnthv_tval_el2", CNTHV_TVAL_EL2, {AArch64::HasV8_1aOps}}, - {"cnthv_cval_el2", CNTHV_CVAL_EL2, {AArch64::HasV8_1aOps}}, - {"cnthv_ctl_el2", CNTHV_CTL_EL2, {AArch64::HasV8_1aOps}}, - {"sctlr_el12", SCTLR_EL12, {AArch64::HasV8_1aOps}}, - {"cpacr_el12", CPACR_EL12, {AArch64::HasV8_1aOps}}, - {"ttbr0_el12", TTBR0_EL12, {AArch64::HasV8_1aOps}}, - {"ttbr1_el12", TTBR1_EL12, {AArch64::HasV8_1aOps}}, - {"tcr_el12", TCR_EL12, {AArch64::HasV8_1aOps}}, - {"afsr0_el12", AFSR0_EL12, {AArch64::HasV8_1aOps}}, - {"afsr1_el12", AFSR1_EL12, {AArch64::HasV8_1aOps}}, - {"esr_el12", ESR_EL12, {AArch64::HasV8_1aOps}}, - {"far_el12", FAR_EL12, {AArch64::HasV8_1aOps}}, - {"mair_el12", MAIR_EL12, {AArch64::HasV8_1aOps}}, - {"amair_el12", AMAIR_EL12, {AArch64::HasV8_1aOps}}, - {"vbar_el12", VBAR_EL12, {AArch64::HasV8_1aOps}}, - {"contextidr_el12", CONTEXTIDR_EL12, {AArch64::HasV8_1aOps}}, - {"cntkctl_el12", CNTKCTL_EL12, {AArch64::HasV8_1aOps}}, - {"cntp_tval_el02", CNTP_TVAL_EL02, {AArch64::HasV8_1aOps}}, - {"cntp_ctl_el02", CNTP_CTL_EL02, {AArch64::HasV8_1aOps}}, - {"cntp_cval_el02", CNTP_CVAL_EL02, {AArch64::HasV8_1aOps}}, - {"cntv_tval_el02", CNTV_TVAL_EL02, {AArch64::HasV8_1aOps}}, - 
{"cntv_ctl_el02", CNTV_CTL_EL02, {AArch64::HasV8_1aOps}}, - {"cntv_cval_el02", CNTV_CVAL_EL02, {AArch64::HasV8_1aOps}}, - {"spsr_el12", SPSR_EL12, {AArch64::HasV8_1aOps}}, - {"elr_el12", ELR_EL12, {AArch64::HasV8_1aOps}}, - - // v8.2a registers - {"uao", UAO, {AArch64::HasV8_2aOps}}, - - // v8.2a "Statistical Profiling extension" registers - {"pmblimitr_el1", PMBLIMITR_EL1, {AArch64::FeatureSPE}}, - {"pmbptr_el1", PMBPTR_EL1, {AArch64::FeatureSPE}}, - {"pmbsr_el1", PMBSR_EL1, {AArch64::FeatureSPE}}, - {"pmbidr_el1", PMBIDR_EL1, {AArch64::FeatureSPE}}, - {"pmscr_el2", PMSCR_EL2, {AArch64::FeatureSPE}}, - {"pmscr_el12", PMSCR_EL12, {AArch64::FeatureSPE}}, - {"pmscr_el1", PMSCR_EL1, {AArch64::FeatureSPE}}, - {"pmsicr_el1", PMSICR_EL1, {AArch64::FeatureSPE}}, - {"pmsirr_el1", PMSIRR_EL1, {AArch64::FeatureSPE}}, - {"pmsfcr_el1", PMSFCR_EL1, {AArch64::FeatureSPE}}, - {"pmsevfr_el1", PMSEVFR_EL1, {AArch64::FeatureSPE}}, - {"pmslatfr_el1", PMSLATFR_EL1, {AArch64::FeatureSPE}}, - {"pmsidr_el1", PMSIDR_EL1, {AArch64::FeatureSPE}}, -}; - -uint32_t -AArch64SysReg::SysRegMapper::fromString(StringRef Name, - const FeatureBitset& FeatureBits, bool &Valid) const { - std::string NameLower = Name.lower(); - - // First search the registers shared by all - for (unsigned i = 0; i < array_lengthof(SysRegMappings); ++i) { - if (SysRegMappings[i].isNameEqual(NameLower, FeatureBits)) { - Valid = true; - return SysRegMappings[i].Value; - } +namespace llvm { + namespace AArch64PSBHint { +#define GET_PSB_IMPL +#include "AArch64GenSystemOperands.inc" } +} - // Now try the instruction-specific registers (either read-only or - // write-only). - for (unsigned i = 0; i < NumInstMappings; ++i) { - if (InstMappings[i].isNameEqual(NameLower, FeatureBits)) { - Valid = true; - return InstMappings[i].Value; - } +namespace llvm { + namespace AArch64SysReg { +#define GET_SYSREG_IMPL +#include "AArch64GenSystemOperands.inc" } +} +uint32_t AArch64SysReg::parseGenericRegister(StringRef Name) { // Try to parse an S<op0>_<op1>_<Cn>_<Cm>_<op2> register name - Regex GenericRegPattern("^s([0-3])_([0-7])_c([0-9]|1[0-5])_c([0-9]|1[0-5])_([0-7])$"); + Regex GenericRegPattern("^S([0-3])_([0-7])_C([0-9]|1[0-5])_C([0-9]|1[0-5])_([0-7])$"); + std::string UpperName = Name.upper(); SmallVector<StringRef, 5> Ops; - if (!GenericRegPattern.match(NameLower, &Ops)) { - Valid = false; + if (!GenericRegPattern.match(UpperName, &Ops)) return -1; - } uint32_t Op0 = 0, Op1 = 0, CRn = 0, CRm = 0, Op2 = 0; uint32_t Bits; @@ -873,28 +99,10 @@ AArch64SysReg::SysRegMapper::fromString(StringRef Name, Ops[5].getAsInteger(10, Op2); Bits = (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2; - Valid = true; return Bits; } -std::string -AArch64SysReg::SysRegMapper::toString(uint32_t Bits, - const FeatureBitset& FeatureBits) const { - // First search the registers shared by all - for (unsigned i = 0; i < array_lengthof(SysRegMappings); ++i) { - if (SysRegMappings[i].isValueEqual(Bits, FeatureBits)) { - return SysRegMappings[i].Name; - } - } - - // Now try the instruction-specific registers (either read-only or - // write-only). 
- for (unsigned i = 0; i < NumInstMappings; ++i) { - if (InstMappings[i].isValueEqual(Bits, FeatureBits)) { - return InstMappings[i].Name; - } - } - +std::string AArch64SysReg::genericRegisterString(uint32_t Bits) { assert(Bits < 0x10000); uint32_t Op0 = (Bits >> 14) & 0x3; uint32_t Op1 = (Bits >> 11) & 0x7; @@ -902,44 +110,13 @@ AArch64SysReg::SysRegMapper::toString(uint32_t Bits, uint32_t CRm = (Bits >> 3) & 0xf; uint32_t Op2 = Bits & 0x7; - return "s" + utostr(Op0)+ "_" + utostr(Op1) + "_c" + utostr(CRn) - + "_c" + utostr(CRm) + "_" + utostr(Op2); + return "S" + utostr(Op0) + "_" + utostr(Op1) + "_C" + utostr(CRn) + "_C" + + utostr(CRm) + "_" + utostr(Op2); } -const AArch64NamedImmMapper::Mapping AArch64TLBI::TLBIMapper::TLBIMappings[] = { - {"ipas2e1is", IPAS2E1IS, {}}, - {"ipas2le1is", IPAS2LE1IS, {}}, - {"vmalle1is", VMALLE1IS, {}}, - {"alle2is", ALLE2IS, {}}, - {"alle3is", ALLE3IS, {}}, - {"vae1is", VAE1IS, {}}, - {"vae2is", VAE2IS, {}}, - {"vae3is", VAE3IS, {}}, - {"aside1is", ASIDE1IS, {}}, - {"vaae1is", VAAE1IS, {}}, - {"alle1is", ALLE1IS, {}}, - {"vale1is", VALE1IS, {}}, - {"vale2is", VALE2IS, {}}, - {"vale3is", VALE3IS, {}}, - {"vmalls12e1is", VMALLS12E1IS, {}}, - {"vaale1is", VAALE1IS, {}}, - {"ipas2e1", IPAS2E1, {}}, - {"ipas2le1", IPAS2LE1, {}}, - {"vmalle1", VMALLE1, {}}, - {"alle2", ALLE2, {}}, - {"alle3", ALLE3, {}}, - {"vae1", VAE1, {}}, - {"vae2", VAE2, {}}, - {"vae3", VAE3, {}}, - {"aside1", ASIDE1, {}}, - {"vaae1", VAAE1, {}}, - {"alle1", ALLE1, {}}, - {"vale1", VALE1, {}}, - {"vale2", VALE2, {}}, - {"vale3", VALE3, {}}, - {"vmalls12e1", VMALLS12E1, {}}, - {"vaale1", VAALE1, {}} -}; - -AArch64TLBI::TLBIMapper::TLBIMapper() - : AArch64NamedImmMapper(TLBIMappings, 0) {} +namespace llvm { + namespace AArch64TLBI { +#define GET_TLBI_IMPL +#include "AArch64GenSystemOperands.inc" + } +} diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h index e63627eae123b..dcc39176031c5 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -266,231 +266,85 @@ inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) { } } // end namespace AArch64CC -/// Instances of this class can perform bidirectional mapping from random -/// identifier strings to operand encodings. For example "MSR" takes a named -/// system-register which must be encoded somehow and decoded for printing. This -/// central location means that the information for those transformations is not -/// duplicated and remains in sync. -/// -/// FIXME: currently the algorithm is a completely unoptimised linear -/// search. Obviously this could be improved, but we would probably want to work -/// out just how often these instructions are emitted before working on it. It -/// might even be optimal to just reorder the tables for the common instructions -/// rather than changing the algorithm. 
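
[editor's note] The new free functions parseGenericRegister()/genericRegisterString() keep the mapper's old bit layout for generic S<op0>_<op1>_C<n>_C<m>_<op2> names: Op0 in bits 15:14, Op1 in 13:11, CRn in 10:7, CRm in 6:3, Op2 in 2:0, i.e. (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2. A small self-contained check against a value quoted elsewhere in this patch, TPIDR_EL0 = 0xde82 ("11 011 1101 0000 010", i.e. S3_3_C13_C0_2):

    #include <cassert>
    #include <cstdint>

    // Pack a system-register encoding the same way genericRegisterString()
    // unpacks it.
    static uint32_t packSysReg(uint32_t Op0, uint32_t Op1, uint32_t CRn,
                               uint32_t CRm, uint32_t Op2) {
      return (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2;
    }

    int main() {
      uint32_t Bits = packSysReg(3, 3, 13, 0, 2);   // S3_3_C13_C0_2
      assert(Bits == 0xde82);                       // tpidr_el0
      assert(((Bits >> 14) & 0x3) == 3);            // Op0
      assert(((Bits >> 7) & 0xf) == 13);            // CRn
      return 0;
    }
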
-struct AArch64NamedImmMapper { - struct Mapping { +namespace AArch64AT{ + struct AT { const char *Name; - uint32_t Value; - // Set of features this mapping is available for - // Zero value of FeatureBitSet means the mapping is always available - FeatureBitset FeatureBitSet; - - bool isNameEqual(std::string Other, - const FeatureBitset& FeatureBits) const { - if (FeatureBitSet.any() && - (FeatureBitSet & FeatureBits).none()) - return false; - return Name == Other; - } - - bool isValueEqual(uint32_t Other, - const FeatureBitset& FeatureBits) const { - if (FeatureBitSet.any() && - (FeatureBitSet & FeatureBits).none()) - return false; - return Value == Other; - } - }; - - template<int N> - AArch64NamedImmMapper(const Mapping (&Mappings)[N], uint32_t TooBigImm) - : Mappings(&Mappings[0]), NumMappings(N), TooBigImm(TooBigImm) {} - - // Maps value to string, depending on availability for FeatureBits given - StringRef toString(uint32_t Value, const FeatureBitset& FeatureBits, - bool &Valid) const; - // Maps string to value, depending on availability for FeatureBits given - uint32_t fromString(StringRef Name, const FeatureBitset& FeatureBits, - bool &Valid) const; - - /// Many of the instructions allow an alternative assembly form consisting of - /// a simple immediate. Currently the only valid forms are ranges [0, N) where - /// N being 0 indicates no immediate syntax-form is allowed. - bool validImm(uint32_t Value) const; -protected: - const Mapping *Mappings; - size_t NumMappings; - uint32_t TooBigImm; -}; - -namespace AArch64AT { - enum ATValues { - Invalid = -1, // Op0 Op1 CRn CRm Op2 - S1E1R = 0x43c0, // 01 000 0111 1000 000 - S1E2R = 0x63c0, // 01 100 0111 1000 000 - S1E3R = 0x73c0, // 01 110 0111 1000 000 - S1E1W = 0x43c1, // 01 000 0111 1000 001 - S1E2W = 0x63c1, // 01 100 0111 1000 001 - S1E3W = 0x73c1, // 01 110 0111 1000 001 - S1E0R = 0x43c2, // 01 000 0111 1000 010 - S1E0W = 0x43c3, // 01 000 0111 1000 011 - S12E1R = 0x63c4, // 01 100 0111 1000 100 - S12E1W = 0x63c5, // 01 100 0111 1000 101 - S12E0R = 0x63c6, // 01 100 0111 1000 110 - S12E0W = 0x63c7, // 01 100 0111 1000 111 - S1E1RP = 0x43c8, // 01 000 0111 1001 000 - S1E1WP = 0x43c9 // 01 000 0111 1001 001 + uint16_t Encoding; }; - struct ATMapper : AArch64NamedImmMapper { - const static Mapping ATMappings[]; - - ATMapper(); - }; + #define GET_AT_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64DB { - enum DBValues { - Invalid = -1, - OSHLD = 0x1, - OSHST = 0x2, - OSH = 0x3, - NSHLD = 0x5, - NSHST = 0x6, - NSH = 0x7, - ISHLD = 0x9, - ISHST = 0xa, - ISH = 0xb, - LD = 0xd, - ST = 0xe, - SY = 0xf + struct DB { + const char *Name; + uint16_t Encoding; }; - struct DBarrierMapper : AArch64NamedImmMapper { - const static Mapping DBarrierMappings[]; - - DBarrierMapper(); - }; + #define GET_DB_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64DC { - enum DCValues { - Invalid = -1, // Op1 CRn CRm Op2 - ZVA = 0x5ba1, // 01 011 0111 0100 001 - IVAC = 0x43b1, // 01 000 0111 0110 001 - ISW = 0x43b2, // 01 000 0111 0110 010 - CVAC = 0x5bd1, // 01 011 0111 1010 001 - CSW = 0x43d2, // 01 000 0111 1010 010 - CVAU = 0x5bd9, // 01 011 0111 1011 001 - CIVAC = 0x5bf1, // 01 011 0111 1110 001 - CISW = 0x43f2 // 01 000 0111 1110 010 - }; - - struct DCMapper : AArch64NamedImmMapper { - const static Mapping DCMappings[]; - - DCMapper(); + struct DC { + const char *Name; + uint16_t Encoding; }; + #define GET_DC_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64IC { - enum ICValues { - Invalid = -1, // Op1 CRn 
CRm Op2 - IALLUIS = 0x0388, // 000 0111 0001 000 - IALLU = 0x03a8, // 000 0111 0101 000 - IVAU = 0x1ba9 // 011 0111 0101 001 - }; - - - struct ICMapper : AArch64NamedImmMapper { - const static Mapping ICMappings[]; - - ICMapper(); + struct IC { + const char *Name; + uint16_t Encoding; + bool NeedsReg; }; - - static inline bool NeedsRegister(ICValues Val) { - return Val == IVAU; - } + #define GET_IC_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64ISB { - enum ISBValues { - Invalid = -1, - SY = 0xf - }; - struct ISBMapper : AArch64NamedImmMapper { - const static Mapping ISBMappings[]; - - ISBMapper(); + struct ISB { + const char *Name; + uint16_t Encoding; }; + #define GET_ISB_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64PRFM { - enum PRFMValues { - Invalid = -1, - PLDL1KEEP = 0x00, - PLDL1STRM = 0x01, - PLDL2KEEP = 0x02, - PLDL2STRM = 0x03, - PLDL3KEEP = 0x04, - PLDL3STRM = 0x05, - PLIL1KEEP = 0x08, - PLIL1STRM = 0x09, - PLIL2KEEP = 0x0a, - PLIL2STRM = 0x0b, - PLIL3KEEP = 0x0c, - PLIL3STRM = 0x0d, - PSTL1KEEP = 0x10, - PSTL1STRM = 0x11, - PSTL2KEEP = 0x12, - PSTL2STRM = 0x13, - PSTL3KEEP = 0x14, - PSTL3STRM = 0x15 - }; - - struct PRFMMapper : AArch64NamedImmMapper { - const static Mapping PRFMMappings[]; - - PRFMMapper(); + struct PRFM { + const char *Name; + uint16_t Encoding; }; + #define GET_PRFM_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64PState { - enum PStateValues { - Invalid = -1, - SPSel = 0x05, - DAIFSet = 0x1e, - DAIFClr = 0x1f, - - // v8.1a "Privileged Access Never" extension-specific PStates - PAN = 0x04, - - // v8.2a "User Access Override" extension-specific PStates - UAO = 0x03 - }; - - struct PStateMapper : AArch64NamedImmMapper { - const static Mapping PStateMappings[]; + struct PState { + const char *Name; + uint16_t Encoding; + FeatureBitset FeaturesRequired; - PStateMapper(); + bool haveFeatures(FeatureBitset ActiveFeatures) const { + return (FeaturesRequired & ActiveFeatures) == FeaturesRequired; + } }; - + #define GET_PSTATE_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64PSBHint { - enum PSBHintValues { - Invalid = -1, - // v8.2a "Statistical Profiling" extension-specific PSB operands - CSync = 0x11, // psb csync = hint #0x11 - }; - - struct PSBHintMapper : AArch64NamedImmMapper { - const static Mapping PSBHintMappings[]; - - PSBHintMapper(); + struct PSB { + const char *Name; + uint16_t Encoding; }; - + #define GET_PSB_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64SE { @@ -574,754 +428,36 @@ AArch64StringToVectorLayout(StringRef LayoutStr) { } namespace AArch64SysReg { - enum SysRegROValues { - MDCCSR_EL0 = 0x9808, // 10 011 0000 0001 000 - DBGDTRRX_EL0 = 0x9828, // 10 011 0000 0101 000 - MDRAR_EL1 = 0x8080, // 10 000 0001 0000 000 - OSLSR_EL1 = 0x808c, // 10 000 0001 0001 100 - DBGAUTHSTATUS_EL1 = 0x83f6, // 10 000 0111 1110 110 - PMCEID0_EL0 = 0xdce6, // 11 011 1001 1100 110 - PMCEID1_EL0 = 0xdce7, // 11 011 1001 1100 111 - MIDR_EL1 = 0xc000, // 11 000 0000 0000 000 - CCSIDR_EL1 = 0xc800, // 11 001 0000 0000 000 - CLIDR_EL1 = 0xc801, // 11 001 0000 0000 001 - CTR_EL0 = 0xd801, // 11 011 0000 0000 001 - MPIDR_EL1 = 0xc005, // 11 000 0000 0000 101 - REVIDR_EL1 = 0xc006, // 11 000 0000 0000 110 - AIDR_EL1 = 0xc807, // 11 001 0000 0000 111 - DCZID_EL0 = 0xd807, // 11 011 0000 0000 111 - ID_PFR0_EL1 = 0xc008, // 11 000 0000 0001 000 - ID_PFR1_EL1 = 0xc009, // 11 000 0000 0001 001 - ID_DFR0_EL1 = 0xc00a, // 11 000 0000 0001 010 - ID_AFR0_EL1 = 0xc00b, // 11 000 0000 0001 
011 - ID_MMFR0_EL1 = 0xc00c, // 11 000 0000 0001 100 - ID_MMFR1_EL1 = 0xc00d, // 11 000 0000 0001 101 - ID_MMFR2_EL1 = 0xc00e, // 11 000 0000 0001 110 - ID_MMFR3_EL1 = 0xc00f, // 11 000 0000 0001 111 - ID_ISAR0_EL1 = 0xc010, // 11 000 0000 0010 000 - ID_ISAR1_EL1 = 0xc011, // 11 000 0000 0010 001 - ID_ISAR2_EL1 = 0xc012, // 11 000 0000 0010 010 - ID_ISAR3_EL1 = 0xc013, // 11 000 0000 0010 011 - ID_ISAR4_EL1 = 0xc014, // 11 000 0000 0010 100 - ID_ISAR5_EL1 = 0xc015, // 11 000 0000 0010 101 - ID_A64PFR0_EL1 = 0xc020, // 11 000 0000 0100 000 - ID_A64PFR1_EL1 = 0xc021, // 11 000 0000 0100 001 - ID_A64DFR0_EL1 = 0xc028, // 11 000 0000 0101 000 - ID_A64DFR1_EL1 = 0xc029, // 11 000 0000 0101 001 - ID_A64AFR0_EL1 = 0xc02c, // 11 000 0000 0101 100 - ID_A64AFR1_EL1 = 0xc02d, // 11 000 0000 0101 101 - ID_A64ISAR0_EL1 = 0xc030, // 11 000 0000 0110 000 - ID_A64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001 - ID_A64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000 - ID_A64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001 - ID_A64MMFR2_EL1 = 0xc03a, // 11 000 0000 0111 010 - MVFR0_EL1 = 0xc018, // 11 000 0000 0011 000 - MVFR1_EL1 = 0xc019, // 11 000 0000 0011 001 - MVFR2_EL1 = 0xc01a, // 11 000 0000 0011 010 - RVBAR_EL1 = 0xc601, // 11 000 1100 0000 001 - RVBAR_EL2 = 0xe601, // 11 100 1100 0000 001 - RVBAR_EL3 = 0xf601, // 11 110 1100 0000 001 - ISR_EL1 = 0xc608, // 11 000 1100 0001 000 - CNTPCT_EL0 = 0xdf01, // 11 011 1110 0000 001 - CNTVCT_EL0 = 0xdf02, // 11 011 1110 0000 010 - ID_MMFR4_EL1 = 0xc016, // 11 000 0000 0010 110 - - // Trace registers - TRCSTATR = 0x8818, // 10 001 0000 0011 000 - TRCIDR8 = 0x8806, // 10 001 0000 0000 110 - TRCIDR9 = 0x880e, // 10 001 0000 0001 110 - TRCIDR10 = 0x8816, // 10 001 0000 0010 110 - TRCIDR11 = 0x881e, // 10 001 0000 0011 110 - TRCIDR12 = 0x8826, // 10 001 0000 0100 110 - TRCIDR13 = 0x882e, // 10 001 0000 0101 110 - TRCIDR0 = 0x8847, // 10 001 0000 1000 111 - TRCIDR1 = 0x884f, // 10 001 0000 1001 111 - TRCIDR2 = 0x8857, // 10 001 0000 1010 111 - TRCIDR3 = 0x885f, // 10 001 0000 1011 111 - TRCIDR4 = 0x8867, // 10 001 0000 1100 111 - TRCIDR5 = 0x886f, // 10 001 0000 1101 111 - TRCIDR6 = 0x8877, // 10 001 0000 1110 111 - TRCIDR7 = 0x887f, // 10 001 0000 1111 111 - TRCOSLSR = 0x888c, // 10 001 0001 0001 100 - TRCPDSR = 0x88ac, // 10 001 0001 0101 100 - TRCDEVAFF0 = 0x8bd6, // 10 001 0111 1010 110 - TRCDEVAFF1 = 0x8bde, // 10 001 0111 1011 110 - TRCLSR = 0x8bee, // 10 001 0111 1101 110 - TRCAUTHSTATUS = 0x8bf6, // 10 001 0111 1110 110 - TRCDEVARCH = 0x8bfe, // 10 001 0111 1111 110 - TRCDEVID = 0x8b97, // 10 001 0111 0010 111 - TRCDEVTYPE = 0x8b9f, // 10 001 0111 0011 111 - TRCPIDR4 = 0x8ba7, // 10 001 0111 0100 111 - TRCPIDR5 = 0x8baf, // 10 001 0111 0101 111 - TRCPIDR6 = 0x8bb7, // 10 001 0111 0110 111 - TRCPIDR7 = 0x8bbf, // 10 001 0111 0111 111 - TRCPIDR0 = 0x8bc7, // 10 001 0111 1000 111 - TRCPIDR1 = 0x8bcf, // 10 001 0111 1001 111 - TRCPIDR2 = 0x8bd7, // 10 001 0111 1010 111 - TRCPIDR3 = 0x8bdf, // 10 001 0111 1011 111 - TRCCIDR0 = 0x8be7, // 10 001 0111 1100 111 - TRCCIDR1 = 0x8bef, // 10 001 0111 1101 111 - TRCCIDR2 = 0x8bf7, // 10 001 0111 1110 111 - TRCCIDR3 = 0x8bff, // 10 001 0111 1111 111 - - // GICv3 registers - ICC_IAR1_EL1 = 0xc660, // 11 000 1100 1100 000 - ICC_IAR0_EL1 = 0xc640, // 11 000 1100 1000 000 - ICC_HPPIR1_EL1 = 0xc662, // 11 000 1100 1100 010 - ICC_HPPIR0_EL1 = 0xc642, // 11 000 1100 1000 010 - ICC_RPR_EL1 = 0xc65b, // 11 000 1100 1011 011 - ICH_VTR_EL2 = 0xe659, // 11 100 1100 1011 001 - ICH_EISR_EL2 = 0xe65b, // 11 100 1100 1011 011 - ICH_ELSR_EL2 = 
0xe65d // 11 100 1100 1011 101 - }; - - enum SysRegWOValues { - DBGDTRTX_EL0 = 0x9828, // 10 011 0000 0101 000 - OSLAR_EL1 = 0x8084, // 10 000 0001 0000 100 - PMSWINC_EL0 = 0xdce4, // 11 011 1001 1100 100 - - // Trace Registers - TRCOSLAR = 0x8884, // 10 001 0001 0000 100 - TRCLAR = 0x8be6, // 10 001 0111 1100 110 - - // GICv3 registers - ICC_EOIR1_EL1 = 0xc661, // 11 000 1100 1100 001 - ICC_EOIR0_EL1 = 0xc641, // 11 000 1100 1000 001 - ICC_DIR_EL1 = 0xc659, // 11 000 1100 1011 001 - ICC_SGI1R_EL1 = 0xc65d, // 11 000 1100 1011 101 - ICC_ASGI1R_EL1 = 0xc65e, // 11 000 1100 1011 110 - ICC_SGI0R_EL1 = 0xc65f // 11 000 1100 1011 111 - }; - - enum SysRegValues { - Invalid = -1, // Op0 Op1 CRn CRm Op2 - OSDTRRX_EL1 = 0x8002, // 10 000 0000 0000 010 - OSDTRTX_EL1 = 0x801a, // 10 000 0000 0011 010 - TEECR32_EL1 = 0x9000, // 10 010 0000 0000 000 - MDCCINT_EL1 = 0x8010, // 10 000 0000 0010 000 - MDSCR_EL1 = 0x8012, // 10 000 0000 0010 010 - DBGDTR_EL0 = 0x9820, // 10 011 0000 0100 000 - OSECCR_EL1 = 0x8032, // 10 000 0000 0110 010 - DBGVCR32_EL2 = 0xa038, // 10 100 0000 0111 000 - DBGBVR0_EL1 = 0x8004, // 10 000 0000 0000 100 - DBGBVR1_EL1 = 0x800c, // 10 000 0000 0001 100 - DBGBVR2_EL1 = 0x8014, // 10 000 0000 0010 100 - DBGBVR3_EL1 = 0x801c, // 10 000 0000 0011 100 - DBGBVR4_EL1 = 0x8024, // 10 000 0000 0100 100 - DBGBVR5_EL1 = 0x802c, // 10 000 0000 0101 100 - DBGBVR6_EL1 = 0x8034, // 10 000 0000 0110 100 - DBGBVR7_EL1 = 0x803c, // 10 000 0000 0111 100 - DBGBVR8_EL1 = 0x8044, // 10 000 0000 1000 100 - DBGBVR9_EL1 = 0x804c, // 10 000 0000 1001 100 - DBGBVR10_EL1 = 0x8054, // 10 000 0000 1010 100 - DBGBVR11_EL1 = 0x805c, // 10 000 0000 1011 100 - DBGBVR12_EL1 = 0x8064, // 10 000 0000 1100 100 - DBGBVR13_EL1 = 0x806c, // 10 000 0000 1101 100 - DBGBVR14_EL1 = 0x8074, // 10 000 0000 1110 100 - DBGBVR15_EL1 = 0x807c, // 10 000 0000 1111 100 - DBGBCR0_EL1 = 0x8005, // 10 000 0000 0000 101 - DBGBCR1_EL1 = 0x800d, // 10 000 0000 0001 101 - DBGBCR2_EL1 = 0x8015, // 10 000 0000 0010 101 - DBGBCR3_EL1 = 0x801d, // 10 000 0000 0011 101 - DBGBCR4_EL1 = 0x8025, // 10 000 0000 0100 101 - DBGBCR5_EL1 = 0x802d, // 10 000 0000 0101 101 - DBGBCR6_EL1 = 0x8035, // 10 000 0000 0110 101 - DBGBCR7_EL1 = 0x803d, // 10 000 0000 0111 101 - DBGBCR8_EL1 = 0x8045, // 10 000 0000 1000 101 - DBGBCR9_EL1 = 0x804d, // 10 000 0000 1001 101 - DBGBCR10_EL1 = 0x8055, // 10 000 0000 1010 101 - DBGBCR11_EL1 = 0x805d, // 10 000 0000 1011 101 - DBGBCR12_EL1 = 0x8065, // 10 000 0000 1100 101 - DBGBCR13_EL1 = 0x806d, // 10 000 0000 1101 101 - DBGBCR14_EL1 = 0x8075, // 10 000 0000 1110 101 - DBGBCR15_EL1 = 0x807d, // 10 000 0000 1111 101 - DBGWVR0_EL1 = 0x8006, // 10 000 0000 0000 110 - DBGWVR1_EL1 = 0x800e, // 10 000 0000 0001 110 - DBGWVR2_EL1 = 0x8016, // 10 000 0000 0010 110 - DBGWVR3_EL1 = 0x801e, // 10 000 0000 0011 110 - DBGWVR4_EL1 = 0x8026, // 10 000 0000 0100 110 - DBGWVR5_EL1 = 0x802e, // 10 000 0000 0101 110 - DBGWVR6_EL1 = 0x8036, // 10 000 0000 0110 110 - DBGWVR7_EL1 = 0x803e, // 10 000 0000 0111 110 - DBGWVR8_EL1 = 0x8046, // 10 000 0000 1000 110 - DBGWVR9_EL1 = 0x804e, // 10 000 0000 1001 110 - DBGWVR10_EL1 = 0x8056, // 10 000 0000 1010 110 - DBGWVR11_EL1 = 0x805e, // 10 000 0000 1011 110 - DBGWVR12_EL1 = 0x8066, // 10 000 0000 1100 110 - DBGWVR13_EL1 = 0x806e, // 10 000 0000 1101 110 - DBGWVR14_EL1 = 0x8076, // 10 000 0000 1110 110 - DBGWVR15_EL1 = 0x807e, // 10 000 0000 1111 110 - DBGWCR0_EL1 = 0x8007, // 10 000 0000 0000 111 - DBGWCR1_EL1 = 0x800f, // 10 000 0000 0001 111 - DBGWCR2_EL1 = 0x8017, // 10 000 0000 0010 111 - 
DBGWCR3_EL1 = 0x801f, // 10 000 0000 0011 111 - DBGWCR4_EL1 = 0x8027, // 10 000 0000 0100 111 - DBGWCR5_EL1 = 0x802f, // 10 000 0000 0101 111 - DBGWCR6_EL1 = 0x8037, // 10 000 0000 0110 111 - DBGWCR7_EL1 = 0x803f, // 10 000 0000 0111 111 - DBGWCR8_EL1 = 0x8047, // 10 000 0000 1000 111 - DBGWCR9_EL1 = 0x804f, // 10 000 0000 1001 111 - DBGWCR10_EL1 = 0x8057, // 10 000 0000 1010 111 - DBGWCR11_EL1 = 0x805f, // 10 000 0000 1011 111 - DBGWCR12_EL1 = 0x8067, // 10 000 0000 1100 111 - DBGWCR13_EL1 = 0x806f, // 10 000 0000 1101 111 - DBGWCR14_EL1 = 0x8077, // 10 000 0000 1110 111 - DBGWCR15_EL1 = 0x807f, // 10 000 0000 1111 111 - TEEHBR32_EL1 = 0x9080, // 10 010 0001 0000 000 - OSDLR_EL1 = 0x809c, // 10 000 0001 0011 100 - DBGPRCR_EL1 = 0x80a4, // 10 000 0001 0100 100 - DBGCLAIMSET_EL1 = 0x83c6, // 10 000 0111 1000 110 - DBGCLAIMCLR_EL1 = 0x83ce, // 10 000 0111 1001 110 - CSSELR_EL1 = 0xd000, // 11 010 0000 0000 000 - VPIDR_EL2 = 0xe000, // 11 100 0000 0000 000 - VMPIDR_EL2 = 0xe005, // 11 100 0000 0000 101 - CPACR_EL1 = 0xc082, // 11 000 0001 0000 010 - SCTLR_EL1 = 0xc080, // 11 000 0001 0000 000 - SCTLR_EL2 = 0xe080, // 11 100 0001 0000 000 - SCTLR_EL3 = 0xf080, // 11 110 0001 0000 000 - ACTLR_EL1 = 0xc081, // 11 000 0001 0000 001 - ACTLR_EL2 = 0xe081, // 11 100 0001 0000 001 - ACTLR_EL3 = 0xf081, // 11 110 0001 0000 001 - HCR_EL2 = 0xe088, // 11 100 0001 0001 000 - SCR_EL3 = 0xf088, // 11 110 0001 0001 000 - MDCR_EL2 = 0xe089, // 11 100 0001 0001 001 - SDER32_EL3 = 0xf089, // 11 110 0001 0001 001 - CPTR_EL2 = 0xe08a, // 11 100 0001 0001 010 - CPTR_EL3 = 0xf08a, // 11 110 0001 0001 010 - HSTR_EL2 = 0xe08b, // 11 100 0001 0001 011 - HACR_EL2 = 0xe08f, // 11 100 0001 0001 111 - MDCR_EL3 = 0xf099, // 11 110 0001 0011 001 - TTBR0_EL1 = 0xc100, // 11 000 0010 0000 000 - TTBR0_EL2 = 0xe100, // 11 100 0010 0000 000 - TTBR0_EL3 = 0xf100, // 11 110 0010 0000 000 - TTBR1_EL1 = 0xc101, // 11 000 0010 0000 001 - TCR_EL1 = 0xc102, // 11 000 0010 0000 010 - TCR_EL2 = 0xe102, // 11 100 0010 0000 010 - TCR_EL3 = 0xf102, // 11 110 0010 0000 010 - VTTBR_EL2 = 0xe108, // 11 100 0010 0001 000 - VTCR_EL2 = 0xe10a, // 11 100 0010 0001 010 - DACR32_EL2 = 0xe180, // 11 100 0011 0000 000 - SPSR_EL1 = 0xc200, // 11 000 0100 0000 000 - SPSR_EL2 = 0xe200, // 11 100 0100 0000 000 - SPSR_EL3 = 0xf200, // 11 110 0100 0000 000 - ELR_EL1 = 0xc201, // 11 000 0100 0000 001 - ELR_EL2 = 0xe201, // 11 100 0100 0000 001 - ELR_EL3 = 0xf201, // 11 110 0100 0000 001 - SP_EL0 = 0xc208, // 11 000 0100 0001 000 - SP_EL1 = 0xe208, // 11 100 0100 0001 000 - SP_EL2 = 0xf208, // 11 110 0100 0001 000 - SPSel = 0xc210, // 11 000 0100 0010 000 - NZCV = 0xda10, // 11 011 0100 0010 000 - DAIF = 0xda11, // 11 011 0100 0010 001 - CurrentEL = 0xc212, // 11 000 0100 0010 010 - SPSR_irq = 0xe218, // 11 100 0100 0011 000 - SPSR_abt = 0xe219, // 11 100 0100 0011 001 - SPSR_und = 0xe21a, // 11 100 0100 0011 010 - SPSR_fiq = 0xe21b, // 11 100 0100 0011 011 - FPCR = 0xda20, // 11 011 0100 0100 000 - FPSR = 0xda21, // 11 011 0100 0100 001 - DSPSR_EL0 = 0xda28, // 11 011 0100 0101 000 - DLR_EL0 = 0xda29, // 11 011 0100 0101 001 - IFSR32_EL2 = 0xe281, // 11 100 0101 0000 001 - AFSR0_EL1 = 0xc288, // 11 000 0101 0001 000 - AFSR0_EL2 = 0xe288, // 11 100 0101 0001 000 - AFSR0_EL3 = 0xf288, // 11 110 0101 0001 000 - AFSR1_EL1 = 0xc289, // 11 000 0101 0001 001 - AFSR1_EL2 = 0xe289, // 11 100 0101 0001 001 - AFSR1_EL3 = 0xf289, // 11 110 0101 0001 001 - ESR_EL1 = 0xc290, // 11 000 0101 0010 000 - ESR_EL2 = 0xe290, // 11 100 0101 0010 000 - ESR_EL3 = 0xf290, // 11 110 
0101 0010 000 - FPEXC32_EL2 = 0xe298, // 11 100 0101 0011 000 - FAR_EL1 = 0xc300, // 11 000 0110 0000 000 - FAR_EL2 = 0xe300, // 11 100 0110 0000 000 - FAR_EL3 = 0xf300, // 11 110 0110 0000 000 - HPFAR_EL2 = 0xe304, // 11 100 0110 0000 100 - PAR_EL1 = 0xc3a0, // 11 000 0111 0100 000 - PMCR_EL0 = 0xdce0, // 11 011 1001 1100 000 - PMCNTENSET_EL0 = 0xdce1, // 11 011 1001 1100 001 - PMCNTENCLR_EL0 = 0xdce2, // 11 011 1001 1100 010 - PMOVSCLR_EL0 = 0xdce3, // 11 011 1001 1100 011 - PMSELR_EL0 = 0xdce5, // 11 011 1001 1100 101 - PMCCNTR_EL0 = 0xdce8, // 11 011 1001 1101 000 - PMXEVTYPER_EL0 = 0xdce9, // 11 011 1001 1101 001 - PMXEVCNTR_EL0 = 0xdcea, // 11 011 1001 1101 010 - PMUSERENR_EL0 = 0xdcf0, // 11 011 1001 1110 000 - PMINTENSET_EL1 = 0xc4f1, // 11 000 1001 1110 001 - PMINTENCLR_EL1 = 0xc4f2, // 11 000 1001 1110 010 - PMOVSSET_EL0 = 0xdcf3, // 11 011 1001 1110 011 - MAIR_EL1 = 0xc510, // 11 000 1010 0010 000 - MAIR_EL2 = 0xe510, // 11 100 1010 0010 000 - MAIR_EL3 = 0xf510, // 11 110 1010 0010 000 - AMAIR_EL1 = 0xc518, // 11 000 1010 0011 000 - AMAIR_EL2 = 0xe518, // 11 100 1010 0011 000 - AMAIR_EL3 = 0xf518, // 11 110 1010 0011 000 - VBAR_EL1 = 0xc600, // 11 000 1100 0000 000 - VBAR_EL2 = 0xe600, // 11 100 1100 0000 000 - VBAR_EL3 = 0xf600, // 11 110 1100 0000 000 - RMR_EL1 = 0xc602, // 11 000 1100 0000 010 - RMR_EL2 = 0xe602, // 11 100 1100 0000 010 - RMR_EL3 = 0xf602, // 11 110 1100 0000 010 - CONTEXTIDR_EL1 = 0xc681, // 11 000 1101 0000 001 - TPIDR_EL0 = 0xde82, // 11 011 1101 0000 010 - TPIDR_EL2 = 0xe682, // 11 100 1101 0000 010 - TPIDR_EL3 = 0xf682, // 11 110 1101 0000 010 - TPIDRRO_EL0 = 0xde83, // 11 011 1101 0000 011 - TPIDR_EL1 = 0xc684, // 11 000 1101 0000 100 - CNTFRQ_EL0 = 0xdf00, // 11 011 1110 0000 000 - CNTVOFF_EL2 = 0xe703, // 11 100 1110 0000 011 - CNTKCTL_EL1 = 0xc708, // 11 000 1110 0001 000 - CNTHCTL_EL2 = 0xe708, // 11 100 1110 0001 000 - CNTP_TVAL_EL0 = 0xdf10, // 11 011 1110 0010 000 - CNTHP_TVAL_EL2 = 0xe710, // 11 100 1110 0010 000 - CNTPS_TVAL_EL1 = 0xff10, // 11 111 1110 0010 000 - CNTP_CTL_EL0 = 0xdf11, // 11 011 1110 0010 001 - CNTHP_CTL_EL2 = 0xe711, // 11 100 1110 0010 001 - CNTPS_CTL_EL1 = 0xff11, // 11 111 1110 0010 001 - CNTP_CVAL_EL0 = 0xdf12, // 11 011 1110 0010 010 - CNTHP_CVAL_EL2 = 0xe712, // 11 100 1110 0010 010 - CNTPS_CVAL_EL1 = 0xff12, // 11 111 1110 0010 010 - CNTV_TVAL_EL0 = 0xdf18, // 11 011 1110 0011 000 - CNTV_CTL_EL0 = 0xdf19, // 11 011 1110 0011 001 - CNTV_CVAL_EL0 = 0xdf1a, // 11 011 1110 0011 010 - PMEVCNTR0_EL0 = 0xdf40, // 11 011 1110 1000 000 - PMEVCNTR1_EL0 = 0xdf41, // 11 011 1110 1000 001 - PMEVCNTR2_EL0 = 0xdf42, // 11 011 1110 1000 010 - PMEVCNTR3_EL0 = 0xdf43, // 11 011 1110 1000 011 - PMEVCNTR4_EL0 = 0xdf44, // 11 011 1110 1000 100 - PMEVCNTR5_EL0 = 0xdf45, // 11 011 1110 1000 101 - PMEVCNTR6_EL0 = 0xdf46, // 11 011 1110 1000 110 - PMEVCNTR7_EL0 = 0xdf47, // 11 011 1110 1000 111 - PMEVCNTR8_EL0 = 0xdf48, // 11 011 1110 1001 000 - PMEVCNTR9_EL0 = 0xdf49, // 11 011 1110 1001 001 - PMEVCNTR10_EL0 = 0xdf4a, // 11 011 1110 1001 010 - PMEVCNTR11_EL0 = 0xdf4b, // 11 011 1110 1001 011 - PMEVCNTR12_EL0 = 0xdf4c, // 11 011 1110 1001 100 - PMEVCNTR13_EL0 = 0xdf4d, // 11 011 1110 1001 101 - PMEVCNTR14_EL0 = 0xdf4e, // 11 011 1110 1001 110 - PMEVCNTR15_EL0 = 0xdf4f, // 11 011 1110 1001 111 - PMEVCNTR16_EL0 = 0xdf50, // 11 011 1110 1010 000 - PMEVCNTR17_EL0 = 0xdf51, // 11 011 1110 1010 001 - PMEVCNTR18_EL0 = 0xdf52, // 11 011 1110 1010 010 - PMEVCNTR19_EL0 = 0xdf53, // 11 011 1110 1010 011 - PMEVCNTR20_EL0 = 0xdf54, // 11 011 1110 1010 100 
- PMEVCNTR21_EL0 = 0xdf55, // 11 011 1110 1010 101 - PMEVCNTR22_EL0 = 0xdf56, // 11 011 1110 1010 110 - PMEVCNTR23_EL0 = 0xdf57, // 11 011 1110 1010 111 - PMEVCNTR24_EL0 = 0xdf58, // 11 011 1110 1011 000 - PMEVCNTR25_EL0 = 0xdf59, // 11 011 1110 1011 001 - PMEVCNTR26_EL0 = 0xdf5a, // 11 011 1110 1011 010 - PMEVCNTR27_EL0 = 0xdf5b, // 11 011 1110 1011 011 - PMEVCNTR28_EL0 = 0xdf5c, // 11 011 1110 1011 100 - PMEVCNTR29_EL0 = 0xdf5d, // 11 011 1110 1011 101 - PMEVCNTR30_EL0 = 0xdf5e, // 11 011 1110 1011 110 - PMCCFILTR_EL0 = 0xdf7f, // 11 011 1110 1111 111 - PMEVTYPER0_EL0 = 0xdf60, // 11 011 1110 1100 000 - PMEVTYPER1_EL0 = 0xdf61, // 11 011 1110 1100 001 - PMEVTYPER2_EL0 = 0xdf62, // 11 011 1110 1100 010 - PMEVTYPER3_EL0 = 0xdf63, // 11 011 1110 1100 011 - PMEVTYPER4_EL0 = 0xdf64, // 11 011 1110 1100 100 - PMEVTYPER5_EL0 = 0xdf65, // 11 011 1110 1100 101 - PMEVTYPER6_EL0 = 0xdf66, // 11 011 1110 1100 110 - PMEVTYPER7_EL0 = 0xdf67, // 11 011 1110 1100 111 - PMEVTYPER8_EL0 = 0xdf68, // 11 011 1110 1101 000 - PMEVTYPER9_EL0 = 0xdf69, // 11 011 1110 1101 001 - PMEVTYPER10_EL0 = 0xdf6a, // 11 011 1110 1101 010 - PMEVTYPER11_EL0 = 0xdf6b, // 11 011 1110 1101 011 - PMEVTYPER12_EL0 = 0xdf6c, // 11 011 1110 1101 100 - PMEVTYPER13_EL0 = 0xdf6d, // 11 011 1110 1101 101 - PMEVTYPER14_EL0 = 0xdf6e, // 11 011 1110 1101 110 - PMEVTYPER15_EL0 = 0xdf6f, // 11 011 1110 1101 111 - PMEVTYPER16_EL0 = 0xdf70, // 11 011 1110 1110 000 - PMEVTYPER17_EL0 = 0xdf71, // 11 011 1110 1110 001 - PMEVTYPER18_EL0 = 0xdf72, // 11 011 1110 1110 010 - PMEVTYPER19_EL0 = 0xdf73, // 11 011 1110 1110 011 - PMEVTYPER20_EL0 = 0xdf74, // 11 011 1110 1110 100 - PMEVTYPER21_EL0 = 0xdf75, // 11 011 1110 1110 101 - PMEVTYPER22_EL0 = 0xdf76, // 11 011 1110 1110 110 - PMEVTYPER23_EL0 = 0xdf77, // 11 011 1110 1110 111 - PMEVTYPER24_EL0 = 0xdf78, // 11 011 1110 1111 000 - PMEVTYPER25_EL0 = 0xdf79, // 11 011 1110 1111 001 - PMEVTYPER26_EL0 = 0xdf7a, // 11 011 1110 1111 010 - PMEVTYPER27_EL0 = 0xdf7b, // 11 011 1110 1111 011 - PMEVTYPER28_EL0 = 0xdf7c, // 11 011 1110 1111 100 - PMEVTYPER29_EL0 = 0xdf7d, // 11 011 1110 1111 101 - PMEVTYPER30_EL0 = 0xdf7e, // 11 011 1110 1111 110 - - // Trace registers - TRCPRGCTLR = 0x8808, // 10 001 0000 0001 000 - TRCPROCSELR = 0x8810, // 10 001 0000 0010 000 - TRCCONFIGR = 0x8820, // 10 001 0000 0100 000 - TRCAUXCTLR = 0x8830, // 10 001 0000 0110 000 - TRCEVENTCTL0R = 0x8840, // 10 001 0000 1000 000 - TRCEVENTCTL1R = 0x8848, // 10 001 0000 1001 000 - TRCSTALLCTLR = 0x8858, // 10 001 0000 1011 000 - TRCTSCTLR = 0x8860, // 10 001 0000 1100 000 - TRCSYNCPR = 0x8868, // 10 001 0000 1101 000 - TRCCCCTLR = 0x8870, // 10 001 0000 1110 000 - TRCBBCTLR = 0x8878, // 10 001 0000 1111 000 - TRCTRACEIDR = 0x8801, // 10 001 0000 0000 001 - TRCQCTLR = 0x8809, // 10 001 0000 0001 001 - TRCVICTLR = 0x8802, // 10 001 0000 0000 010 - TRCVIIECTLR = 0x880a, // 10 001 0000 0001 010 - TRCVISSCTLR = 0x8812, // 10 001 0000 0010 010 - TRCVIPCSSCTLR = 0x881a, // 10 001 0000 0011 010 - TRCVDCTLR = 0x8842, // 10 001 0000 1000 010 - TRCVDSACCTLR = 0x884a, // 10 001 0000 1001 010 - TRCVDARCCTLR = 0x8852, // 10 001 0000 1010 010 - TRCSEQEVR0 = 0x8804, // 10 001 0000 0000 100 - TRCSEQEVR1 = 0x880c, // 10 001 0000 0001 100 - TRCSEQEVR2 = 0x8814, // 10 001 0000 0010 100 - TRCSEQRSTEVR = 0x8834, // 10 001 0000 0110 100 - TRCSEQSTR = 0x883c, // 10 001 0000 0111 100 - TRCEXTINSELR = 0x8844, // 10 001 0000 1000 100 - TRCCNTRLDVR0 = 0x8805, // 10 001 0000 0000 101 - TRCCNTRLDVR1 = 0x880d, // 10 001 0000 0001 101 - TRCCNTRLDVR2 = 0x8815, // 10 001 
0000 0010 101 - TRCCNTRLDVR3 = 0x881d, // 10 001 0000 0011 101 - TRCCNTCTLR0 = 0x8825, // 10 001 0000 0100 101 - TRCCNTCTLR1 = 0x882d, // 10 001 0000 0101 101 - TRCCNTCTLR2 = 0x8835, // 10 001 0000 0110 101 - TRCCNTCTLR3 = 0x883d, // 10 001 0000 0111 101 - TRCCNTVR0 = 0x8845, // 10 001 0000 1000 101 - TRCCNTVR1 = 0x884d, // 10 001 0000 1001 101 - TRCCNTVR2 = 0x8855, // 10 001 0000 1010 101 - TRCCNTVR3 = 0x885d, // 10 001 0000 1011 101 - TRCIMSPEC0 = 0x8807, // 10 001 0000 0000 111 - TRCIMSPEC1 = 0x880f, // 10 001 0000 0001 111 - TRCIMSPEC2 = 0x8817, // 10 001 0000 0010 111 - TRCIMSPEC3 = 0x881f, // 10 001 0000 0011 111 - TRCIMSPEC4 = 0x8827, // 10 001 0000 0100 111 - TRCIMSPEC5 = 0x882f, // 10 001 0000 0101 111 - TRCIMSPEC6 = 0x8837, // 10 001 0000 0110 111 - TRCIMSPEC7 = 0x883f, // 10 001 0000 0111 111 - TRCRSCTLR2 = 0x8890, // 10 001 0001 0010 000 - TRCRSCTLR3 = 0x8898, // 10 001 0001 0011 000 - TRCRSCTLR4 = 0x88a0, // 10 001 0001 0100 000 - TRCRSCTLR5 = 0x88a8, // 10 001 0001 0101 000 - TRCRSCTLR6 = 0x88b0, // 10 001 0001 0110 000 - TRCRSCTLR7 = 0x88b8, // 10 001 0001 0111 000 - TRCRSCTLR8 = 0x88c0, // 10 001 0001 1000 000 - TRCRSCTLR9 = 0x88c8, // 10 001 0001 1001 000 - TRCRSCTLR10 = 0x88d0, // 10 001 0001 1010 000 - TRCRSCTLR11 = 0x88d8, // 10 001 0001 1011 000 - TRCRSCTLR12 = 0x88e0, // 10 001 0001 1100 000 - TRCRSCTLR13 = 0x88e8, // 10 001 0001 1101 000 - TRCRSCTLR14 = 0x88f0, // 10 001 0001 1110 000 - TRCRSCTLR15 = 0x88f8, // 10 001 0001 1111 000 - TRCRSCTLR16 = 0x8881, // 10 001 0001 0000 001 - TRCRSCTLR17 = 0x8889, // 10 001 0001 0001 001 - TRCRSCTLR18 = 0x8891, // 10 001 0001 0010 001 - TRCRSCTLR19 = 0x8899, // 10 001 0001 0011 001 - TRCRSCTLR20 = 0x88a1, // 10 001 0001 0100 001 - TRCRSCTLR21 = 0x88a9, // 10 001 0001 0101 001 - TRCRSCTLR22 = 0x88b1, // 10 001 0001 0110 001 - TRCRSCTLR23 = 0x88b9, // 10 001 0001 0111 001 - TRCRSCTLR24 = 0x88c1, // 10 001 0001 1000 001 - TRCRSCTLR25 = 0x88c9, // 10 001 0001 1001 001 - TRCRSCTLR26 = 0x88d1, // 10 001 0001 1010 001 - TRCRSCTLR27 = 0x88d9, // 10 001 0001 1011 001 - TRCRSCTLR28 = 0x88e1, // 10 001 0001 1100 001 - TRCRSCTLR29 = 0x88e9, // 10 001 0001 1101 001 - TRCRSCTLR30 = 0x88f1, // 10 001 0001 1110 001 - TRCRSCTLR31 = 0x88f9, // 10 001 0001 1111 001 - TRCSSCCR0 = 0x8882, // 10 001 0001 0000 010 - TRCSSCCR1 = 0x888a, // 10 001 0001 0001 010 - TRCSSCCR2 = 0x8892, // 10 001 0001 0010 010 - TRCSSCCR3 = 0x889a, // 10 001 0001 0011 010 - TRCSSCCR4 = 0x88a2, // 10 001 0001 0100 010 - TRCSSCCR5 = 0x88aa, // 10 001 0001 0101 010 - TRCSSCCR6 = 0x88b2, // 10 001 0001 0110 010 - TRCSSCCR7 = 0x88ba, // 10 001 0001 0111 010 - TRCSSCSR0 = 0x88c2, // 10 001 0001 1000 010 - TRCSSCSR1 = 0x88ca, // 10 001 0001 1001 010 - TRCSSCSR2 = 0x88d2, // 10 001 0001 1010 010 - TRCSSCSR3 = 0x88da, // 10 001 0001 1011 010 - TRCSSCSR4 = 0x88e2, // 10 001 0001 1100 010 - TRCSSCSR5 = 0x88ea, // 10 001 0001 1101 010 - TRCSSCSR6 = 0x88f2, // 10 001 0001 1110 010 - TRCSSCSR7 = 0x88fa, // 10 001 0001 1111 010 - TRCSSPCICR0 = 0x8883, // 10 001 0001 0000 011 - TRCSSPCICR1 = 0x888b, // 10 001 0001 0001 011 - TRCSSPCICR2 = 0x8893, // 10 001 0001 0010 011 - TRCSSPCICR3 = 0x889b, // 10 001 0001 0011 011 - TRCSSPCICR4 = 0x88a3, // 10 001 0001 0100 011 - TRCSSPCICR5 = 0x88ab, // 10 001 0001 0101 011 - TRCSSPCICR6 = 0x88b3, // 10 001 0001 0110 011 - TRCSSPCICR7 = 0x88bb, // 10 001 0001 0111 011 - TRCPDCR = 0x88a4, // 10 001 0001 0100 100 - TRCACVR0 = 0x8900, // 10 001 0010 0000 000 - TRCACVR1 = 0x8910, // 10 001 0010 0010 000 - TRCACVR2 = 0x8920, // 10 001 0010 0100 000 - 
TRCACVR3 = 0x8930, // 10 001 0010 0110 000 - TRCACVR4 = 0x8940, // 10 001 0010 1000 000 - TRCACVR5 = 0x8950, // 10 001 0010 1010 000 - TRCACVR6 = 0x8960, // 10 001 0010 1100 000 - TRCACVR7 = 0x8970, // 10 001 0010 1110 000 - TRCACVR8 = 0x8901, // 10 001 0010 0000 001 - TRCACVR9 = 0x8911, // 10 001 0010 0010 001 - TRCACVR10 = 0x8921, // 10 001 0010 0100 001 - TRCACVR11 = 0x8931, // 10 001 0010 0110 001 - TRCACVR12 = 0x8941, // 10 001 0010 1000 001 - TRCACVR13 = 0x8951, // 10 001 0010 1010 001 - TRCACVR14 = 0x8961, // 10 001 0010 1100 001 - TRCACVR15 = 0x8971, // 10 001 0010 1110 001 - TRCACATR0 = 0x8902, // 10 001 0010 0000 010 - TRCACATR1 = 0x8912, // 10 001 0010 0010 010 - TRCACATR2 = 0x8922, // 10 001 0010 0100 010 - TRCACATR3 = 0x8932, // 10 001 0010 0110 010 - TRCACATR4 = 0x8942, // 10 001 0010 1000 010 - TRCACATR5 = 0x8952, // 10 001 0010 1010 010 - TRCACATR6 = 0x8962, // 10 001 0010 1100 010 - TRCACATR7 = 0x8972, // 10 001 0010 1110 010 - TRCACATR8 = 0x8903, // 10 001 0010 0000 011 - TRCACATR9 = 0x8913, // 10 001 0010 0010 011 - TRCACATR10 = 0x8923, // 10 001 0010 0100 011 - TRCACATR11 = 0x8933, // 10 001 0010 0110 011 - TRCACATR12 = 0x8943, // 10 001 0010 1000 011 - TRCACATR13 = 0x8953, // 10 001 0010 1010 011 - TRCACATR14 = 0x8963, // 10 001 0010 1100 011 - TRCACATR15 = 0x8973, // 10 001 0010 1110 011 - TRCDVCVR0 = 0x8904, // 10 001 0010 0000 100 - TRCDVCVR1 = 0x8924, // 10 001 0010 0100 100 - TRCDVCVR2 = 0x8944, // 10 001 0010 1000 100 - TRCDVCVR3 = 0x8964, // 10 001 0010 1100 100 - TRCDVCVR4 = 0x8905, // 10 001 0010 0000 101 - TRCDVCVR5 = 0x8925, // 10 001 0010 0100 101 - TRCDVCVR6 = 0x8945, // 10 001 0010 1000 101 - TRCDVCVR7 = 0x8965, // 10 001 0010 1100 101 - TRCDVCMR0 = 0x8906, // 10 001 0010 0000 110 - TRCDVCMR1 = 0x8926, // 10 001 0010 0100 110 - TRCDVCMR2 = 0x8946, // 10 001 0010 1000 110 - TRCDVCMR3 = 0x8966, // 10 001 0010 1100 110 - TRCDVCMR4 = 0x8907, // 10 001 0010 0000 111 - TRCDVCMR5 = 0x8927, // 10 001 0010 0100 111 - TRCDVCMR6 = 0x8947, // 10 001 0010 1000 111 - TRCDVCMR7 = 0x8967, // 10 001 0010 1100 111 - TRCCIDCVR0 = 0x8980, // 10 001 0011 0000 000 - TRCCIDCVR1 = 0x8990, // 10 001 0011 0010 000 - TRCCIDCVR2 = 0x89a0, // 10 001 0011 0100 000 - TRCCIDCVR3 = 0x89b0, // 10 001 0011 0110 000 - TRCCIDCVR4 = 0x89c0, // 10 001 0011 1000 000 - TRCCIDCVR5 = 0x89d0, // 10 001 0011 1010 000 - TRCCIDCVR6 = 0x89e0, // 10 001 0011 1100 000 - TRCCIDCVR7 = 0x89f0, // 10 001 0011 1110 000 - TRCVMIDCVR0 = 0x8981, // 10 001 0011 0000 001 - TRCVMIDCVR1 = 0x8991, // 10 001 0011 0010 001 - TRCVMIDCVR2 = 0x89a1, // 10 001 0011 0100 001 - TRCVMIDCVR3 = 0x89b1, // 10 001 0011 0110 001 - TRCVMIDCVR4 = 0x89c1, // 10 001 0011 1000 001 - TRCVMIDCVR5 = 0x89d1, // 10 001 0011 1010 001 - TRCVMIDCVR6 = 0x89e1, // 10 001 0011 1100 001 - TRCVMIDCVR7 = 0x89f1, // 10 001 0011 1110 001 - TRCCIDCCTLR0 = 0x8982, // 10 001 0011 0000 010 - TRCCIDCCTLR1 = 0x898a, // 10 001 0011 0001 010 - TRCVMIDCCTLR0 = 0x8992, // 10 001 0011 0010 010 - TRCVMIDCCTLR1 = 0x899a, // 10 001 0011 0011 010 - TRCITCTRL = 0x8b84, // 10 001 0111 0000 100 - TRCCLAIMSET = 0x8bc6, // 10 001 0111 1000 110 - TRCCLAIMCLR = 0x8bce, // 10 001 0111 1001 110 - - // GICv3 registers - ICC_BPR1_EL1 = 0xc663, // 11 000 1100 1100 011 - ICC_BPR0_EL1 = 0xc643, // 11 000 1100 1000 011 - ICC_PMR_EL1 = 0xc230, // 11 000 0100 0110 000 - ICC_CTLR_EL1 = 0xc664, // 11 000 1100 1100 100 - ICC_CTLR_EL3 = 0xf664, // 11 110 1100 1100 100 - ICC_SRE_EL1 = 0xc665, // 11 000 1100 1100 101 - ICC_SRE_EL2 = 0xe64d, // 11 100 1100 1001 101 - ICC_SRE_EL3 = 0xf665, 
// 11 110 1100 1100 101 - ICC_IGRPEN0_EL1 = 0xc666, // 11 000 1100 1100 110 - ICC_IGRPEN1_EL1 = 0xc667, // 11 000 1100 1100 111 - ICC_IGRPEN1_EL3 = 0xf667, // 11 110 1100 1100 111 - ICC_SEIEN_EL1 = 0xc668, // 11 000 1100 1101 000 - ICC_AP0R0_EL1 = 0xc644, // 11 000 1100 1000 100 - ICC_AP0R1_EL1 = 0xc645, // 11 000 1100 1000 101 - ICC_AP0R2_EL1 = 0xc646, // 11 000 1100 1000 110 - ICC_AP0R3_EL1 = 0xc647, // 11 000 1100 1000 111 - ICC_AP1R0_EL1 = 0xc648, // 11 000 1100 1001 000 - ICC_AP1R1_EL1 = 0xc649, // 11 000 1100 1001 001 - ICC_AP1R2_EL1 = 0xc64a, // 11 000 1100 1001 010 - ICC_AP1R3_EL1 = 0xc64b, // 11 000 1100 1001 011 - ICH_AP0R0_EL2 = 0xe640, // 11 100 1100 1000 000 - ICH_AP0R1_EL2 = 0xe641, // 11 100 1100 1000 001 - ICH_AP0R2_EL2 = 0xe642, // 11 100 1100 1000 010 - ICH_AP0R3_EL2 = 0xe643, // 11 100 1100 1000 011 - ICH_AP1R0_EL2 = 0xe648, // 11 100 1100 1001 000 - ICH_AP1R1_EL2 = 0xe649, // 11 100 1100 1001 001 - ICH_AP1R2_EL2 = 0xe64a, // 11 100 1100 1001 010 - ICH_AP1R3_EL2 = 0xe64b, // 11 100 1100 1001 011 - ICH_HCR_EL2 = 0xe658, // 11 100 1100 1011 000 - ICH_MISR_EL2 = 0xe65a, // 11 100 1100 1011 010 - ICH_VMCR_EL2 = 0xe65f, // 11 100 1100 1011 111 - ICH_VSEIR_EL2 = 0xe64c, // 11 100 1100 1001 100 - ICH_LR0_EL2 = 0xe660, // 11 100 1100 1100 000 - ICH_LR1_EL2 = 0xe661, // 11 100 1100 1100 001 - ICH_LR2_EL2 = 0xe662, // 11 100 1100 1100 010 - ICH_LR3_EL2 = 0xe663, // 11 100 1100 1100 011 - ICH_LR4_EL2 = 0xe664, // 11 100 1100 1100 100 - ICH_LR5_EL2 = 0xe665, // 11 100 1100 1100 101 - ICH_LR6_EL2 = 0xe666, // 11 100 1100 1100 110 - ICH_LR7_EL2 = 0xe667, // 11 100 1100 1100 111 - ICH_LR8_EL2 = 0xe668, // 11 100 1100 1101 000 - ICH_LR9_EL2 = 0xe669, // 11 100 1100 1101 001 - ICH_LR10_EL2 = 0xe66a, // 11 100 1100 1101 010 - ICH_LR11_EL2 = 0xe66b, // 11 100 1100 1101 011 - ICH_LR12_EL2 = 0xe66c, // 11 100 1100 1101 100 - ICH_LR13_EL2 = 0xe66d, // 11 100 1100 1101 101 - ICH_LR14_EL2 = 0xe66e, // 11 100 1100 1101 110 - ICH_LR15_EL2 = 0xe66f, // 11 100 1100 1101 111 - - // v8.1a "Privileged Access Never" extension-specific system registers - PAN = 0xc213, // 11 000 0100 0010 011 - - // v8.1a "Limited Ordering Regions" extension-specific system registers - LORSA_EL1 = 0xc520, // 11 000 1010 0100 000 - LOREA_EL1 = 0xc521, // 11 000 1010 0100 001 - LORN_EL1 = 0xc522, // 11 000 1010 0100 010 - LORC_EL1 = 0xc523, // 11 000 1010 0100 011 - LORID_EL1 = 0xc527, // 11 000 1010 0100 111 - - // v8.1a "Virtualization host extensions" system registers - TTBR1_EL2 = 0xe101, // 11 100 0010 0000 001 - CONTEXTIDR_EL2 = 0xe681, // 11 100 1101 0000 001 - CNTHV_TVAL_EL2 = 0xe718, // 11 100 1110 0011 000 - CNTHV_CVAL_EL2 = 0xe71a, // 11 100 1110 0011 010 - CNTHV_CTL_EL2 = 0xe719, // 11 100 1110 0011 001 - SCTLR_EL12 = 0xe880, // 11 101 0001 0000 000 - CPACR_EL12 = 0xe882, // 11 101 0001 0000 010 - TTBR0_EL12 = 0xe900, // 11 101 0010 0000 000 - TTBR1_EL12 = 0xe901, // 11 101 0010 0000 001 - TCR_EL12 = 0xe902, // 11 101 0010 0000 010 - AFSR0_EL12 = 0xea88, // 11 101 0101 0001 000 - AFSR1_EL12 = 0xea89, // 11 101 0101 0001 001 - ESR_EL12 = 0xea90, // 11 101 0101 0010 000 - FAR_EL12 = 0xeb00, // 11 101 0110 0000 000 - MAIR_EL12 = 0xed10, // 11 101 1010 0010 000 - AMAIR_EL12 = 0xed18, // 11 101 1010 0011 000 - VBAR_EL12 = 0xee00, // 11 101 1100 0000 000 - CONTEXTIDR_EL12 = 0xee81, // 11 101 1101 0000 001 - CNTKCTL_EL12 = 0xef08, // 11 101 1110 0001 000 - CNTP_TVAL_EL02 = 0xef10, // 11 101 1110 0010 000 - CNTP_CTL_EL02 = 0xef11, // 11 101 1110 0010 001 - CNTP_CVAL_EL02 = 0xef12, // 11 101 1110 0010 010 - 
-  CNTV_TVAL_EL02    = 0xef18, // 11 101 1110 0011 000
-  CNTV_CTL_EL02     = 0xef19, // 11 101 1110 0011 001
-  CNTV_CVAL_EL02    = 0xef1a, // 11 101 1110 0011 010
-  SPSR_EL12         = 0xea00, // 11 101 0100 0000 000
-  ELR_EL12          = 0xea01, // 11 101 0100 0000 001
-
-  // v8.2a registers
-  UAO               = 0xc214, // 11 000 0100 0010 100
-
-  // v8.2a "Statistical Profiling extension" registers
-  PMBLIMITR_EL1     = 0xc4d0, // 11 000 1001 1010 000
-  PMBPTR_EL1        = 0xc4d1, // 11 000 1001 1010 001
-  PMBSR_EL1         = 0xc4d3, // 11 000 1001 1010 011
-  PMBIDR_EL1        = 0xc4d7, // 11 000 1001 1010 111
-  PMSCR_EL2         = 0xe4c8, // 11 100 1001 1001 000
-  PMSCR_EL12        = 0xecc8, // 11 101 1001 1001 000
-  PMSCR_EL1         = 0xc4c8, // 11 000 1001 1001 000
-  PMSICR_EL1        = 0xc4ca, // 11 000 1001 1001 010
-  PMSIRR_EL1        = 0xc4cb, // 11 000 1001 1001 011
-  PMSFCR_EL1        = 0xc4cc, // 11 000 1001 1001 100
-  PMSEVFR_EL1       = 0xc4cd, // 11 000 1001 1001 101
-  PMSLATFR_EL1      = 0xc4ce, // 11 000 1001 1001 110
-  PMSIDR_EL1        = 0xc4cf, // 11 000 1001 1001 111
+  struct SysReg {
+    const char *Name;
+    unsigned Encoding;
+    bool Readable;
+    bool Writeable;
+    FeatureBitset FeaturesRequired;
-  // Cyclone specific system registers
-  CPM_IOACC_CTL_EL3 = 0xff90,
+    bool haveFeatures(FeatureBitset ActiveFeatures) const {
+      return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
+    }
   };
-  // Note that these do not inherit from AArch64NamedImmMapper. This class is
-  // sufficiently different in its behaviour that I don't believe it's worth
-  // burdening the common AArch64NamedImmMapper with abstractions only needed in
-  // this one case.
-  struct SysRegMapper {
-    static const AArch64NamedImmMapper::Mapping SysRegMappings[];
+  #define GET_SYSREG_DECL
+  #include "AArch64GenSystemOperands.inc"
-    const AArch64NamedImmMapper::Mapping *InstMappings;
-    size_t NumInstMappings;
+  const SysReg *lookupSysRegByName(StringRef);
+  const SysReg *lookupSysRegByEncoding(uint16_t);
-    SysRegMapper() { }
-    uint32_t fromString(StringRef Name, const FeatureBitset& FeatureBits,
-                        bool &Valid) const;
-    std::string toString(uint32_t Bits, const FeatureBitset& FeatureBits) const;
-  };
-
-  struct MSRMapper : SysRegMapper {
-    static const AArch64NamedImmMapper::Mapping MSRMappings[];
-    MSRMapper();
-  };
-
-  struct MRSMapper : SysRegMapper {
-    static const AArch64NamedImmMapper::Mapping MRSMappings[];
-    MRSMapper();
-  };
-
-  uint32_t ParseGenericRegister(StringRef Name, bool &Valid);
+  uint32_t parseGenericRegister(StringRef Name);
+  std::string genericRegisterString(uint32_t Bits);
 }
 
 namespace AArch64TLBI {
-  enum TLBIValues {
-    Invalid = -1,          // Op0 Op1 CRn CRm Op2
-    IPAS2E1IS    = 0x6401, // 01 100 1000 0000 001
-    IPAS2LE1IS   = 0x6405, // 01 100 1000 0000 101
-    VMALLE1IS    = 0x4418, // 01 000 1000 0011 000
-    ALLE2IS      = 0x6418, // 01 100 1000 0011 000
-    ALLE3IS      = 0x7418, // 01 110 1000 0011 000
-    VAE1IS       = 0x4419, // 01 000 1000 0011 001
-    VAE2IS       = 0x6419, // 01 100 1000 0011 001
-    VAE3IS       = 0x7419, // 01 110 1000 0011 001
-    ASIDE1IS     = 0x441a, // 01 000 1000 0011 010
-    VAAE1IS      = 0x441b, // 01 000 1000 0011 011
-    ALLE1IS      = 0x641c, // 01 100 1000 0011 100
-    VALE1IS      = 0x441d, // 01 000 1000 0011 101
-    VALE2IS      = 0x641d, // 01 100 1000 0011 101
-    VALE3IS      = 0x741d, // 01 110 1000 0011 101
-    VMALLS12E1IS = 0x641e, // 01 100 1000 0011 110
-    VAALE1IS     = 0x441f, // 01 000 1000 0011 111
-    IPAS2E1      = 0x6421, // 01 100 1000 0100 001
-    IPAS2LE1     = 0x6425, // 01 100 1000 0100 101
-    VMALLE1      = 0x4438, // 01 000 1000 0111 000
-    ALLE2        = 0x6438, // 01 100 1000 0111 000
-    ALLE3        = 0x7438, // 01 110 1000 0111 000
-    VAE1         = 0x4439, // 01 000 1000 0111 001
-    VAE2         = 0x6439, // 01 100 1000 0111 001
-    VAE3         = 0x7439, // 01 110 1000 0111 001
-    ASIDE1       = 0x443a, // 01 000 1000 0111 010
-    VAAE1        = 0x443b, // 01 000 1000 0111 011
-    ALLE1        = 0x643c, // 01 100 1000 0111 100
-    VALE1        = 0x443d, // 01 000 1000 0111 101
-    VALE2        = 0x643d, // 01 100 1000 0111 101
-    VALE3        = 0x743d, // 01 110 1000 0111 101
-    VMALLS12E1   = 0x643e, // 01 100 1000 0111 110
-    VAALE1       = 0x443f  // 01 000 1000 0111 111
-  };
-
-  struct TLBIMapper : AArch64NamedImmMapper {
-    const static Mapping TLBIMappings[];
-
-    TLBIMapper();
+  struct TLBI {
+    const char *Name;
+    uint16_t Encoding;
+    bool NeedsReg;
   };
-
-  static inline bool NeedsRegister(TLBIValues Val) {
-    switch (Val) {
-    case VMALLE1IS:
-    case ALLE2IS:
-    case ALLE3IS:
-    case ALLE1IS:
-    case VMALLS12E1IS:
-    case VMALLE1:
-    case ALLE2:
-    case ALLE3:
-    case ALLE1:
-    case VMALLS12E1:
-      return false;
-    default:
-      return true;
-    }
-  }
+  #define GET_TLBI_DECL
+  #include "AArch64GenSystemOperands.inc"
 }
 
 namespace AArch64II {
@@ -1379,12 +515,7 @@ namespace AArch64II {
   /// thread-local symbol. On Darwin, only one type of thread-local access
   /// exists (pre linker-relaxation), but on ELF the TLSModel used for the
   /// referee will affect interpretation.
-  MO_TLS = 0x40,
-
-  /// MO_CONSTPOOL - This flag indicates that a symbol operand represents
-  /// the address of a constant pool entry for the symbol, rather than the
-  /// address of the symbol itself.
-  MO_CONSTPOOL = 0x80
+  MO_TLS = 0x40
 };
 
 } // end namespace AArch64II
diff --git a/lib/Target/AArch64/Utils/Makefile b/lib/Target/AArch64/Utils/Makefile
deleted file mode 100644
index 0b80f82f2b99b..0000000000000
--- a/lib/Target/AArch64/Utils/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/AArch64/Utils/Makefile -------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAArch64Utils
-
-# Hack: we need to include 'main' AArch64 target directory to grab private
-# headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
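
The hunk above replaces the hand-written SysRegMapper/MSRMapper/MRSMapper machinery with a TableGen-generated table (GET_SYSREG_DECL pulled from AArch64GenSystemOperands.inc) plus two lookup functions and a per-entry haveFeatures() check. The sketch below shows how a caller such as the asm parser or instruction printer might consume that API. It is illustrative only: the AArch64SysReg namespace name, the header paths, and the validateMRSOperand/sysRegNameFor helpers are assumptions inferred from context, not part of this patch.

  // Sketch only, assuming the declarations shown in the hunk live in the
  // llvm::AArch64SysReg namespace; header paths are assumed as well.
  #include "Utils/AArch64BaseInfo.h"     // assumed location of the declarations
  #include "llvm/ADT/StringRef.h"
  #include "llvm/MC/SubtargetFeature.h"  // llvm::FeatureBitset

  // Name-based lookup replaces the removed MRSMapper::fromString().
  static bool validateMRSOperand(llvm::StringRef Name,
                                 const llvm::FeatureBitset &ActiveFeatures,
                                 unsigned &Encoding) {
    using namespace llvm::AArch64SysReg;     // namespace name assumed
    const SysReg *Reg = lookupSysRegByName(Name);
    if (!Reg || !Reg->Readable)              // unknown name, or write-only register
      return false;
    if (!Reg->haveFeatures(ActiveFeatures))  // e.g. a v8.2-only register on a v8.0 subtarget
      return false;
    Encoding = Reg->Encoding;                // packed op0/op1/CRn/CRm/op2 value
    return true;
  }

  // Encoding-based lookup replaces SysRegMapper::toString() on the printer side.
  static llvm::StringRef sysRegNameFor(uint16_t Encoding) {
    const llvm::AArch64SysReg::SysReg *Reg =
        llvm::AArch64SysReg::lookupSysRegByEncoding(Encoding);
    return Reg ? llvm::StringRef(Reg->Name) : llvm::StringRef();
  }

An MSR-side check would test Writeable instead of Readable; both flags are fields of the generated SysReg entries rather than properties encoded in separate MSR/MRS mapping tables.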
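
Similarly, the removed NeedsRegister() switch, which enumerated the TLBI operations that take no Xt operand, is folded into the generated table as the NeedsReg field of the new AArch64TLBI::TLBI struct. A minimal consumer-side sketch follows, assuming the entry has already been obtained from the generated table (the lookup helpers emitted by GET_TLBI_DECL are not visible in this hunk) and with the helper name invented for illustration.

  // Sketch only: read the operand requirement straight from the table entry
  // instead of switching over TLBIValues as the removed NeedsRegister() did.
  static bool tlbiExpectsXtOperand(const llvm::AArch64TLBI::TLBI &Op) {
    // Whole-context invalidations such as VMALLE1IS or ALLE2 carry
    // NeedsReg == false; address/ASID-based forms such as VAE1IS carry true.
    return Op.NeedsReg;
  }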