Diffstat (limited to 'llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp')
-rw-r--r-- | llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 311
1 file changed, 270 insertions(+), 41 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 12dddd29ca84..a84d23d3bb96 100644
--- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -32,6 +32,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -50,6 +51,7 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Allocator.h"
@@ -900,7 +902,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
   unsigned Offset = getMemoryOpOffset(*First);
   Register Base = getLoadStoreBaseOp(*First).getReg();
   bool BaseKill = LatestMI->killsRegister(Base);
-  unsigned PredReg = 0;
+  Register PredReg;
   ARMCC::CondCodes Pred = getInstrPredicate(*First, PredReg);
   DebugLoc DL = First->getDebugLoc();
   MachineInstr *Merged = nullptr;
@@ -991,7 +993,7 @@ static bool mayCombineMisaligned(const TargetSubtargetInfo &STI,
   // Stack pointer alignment is out of the programmers control so we can trust
   // SP-relative loads/stores.
   if (getLoadStoreBaseOp(MI).getReg() == ARM::SP &&
-      STI.getFrameLowering()->getTransientStackAlignment() >= 4)
+      STI.getFrameLowering()->getTransientStackAlign() >= Align(4))
     return true;
   return false;
 }
@@ -1183,8 +1185,8 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
 /// Check if the given instruction increments or decrements a register and
 /// return the amount it is incremented/decremented. Returns 0 if the CPSR flags
 /// generated by the instruction are possibly read as well.
-static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
-                                  ARMCC::CondCodes Pred, unsigned PredReg) {
+static int isIncrementOrDecrement(const MachineInstr &MI, Register Reg,
+                                  ARMCC::CondCodes Pred, Register PredReg) {
   bool CheckCPSRDef;
   int Scale;
   switch (MI.getOpcode()) {
@@ -1201,7 +1203,7 @@ static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
   default: return 0;
   }
 
-  unsigned MIPredReg;
+  Register MIPredReg;
   if (MI.getOperand(0).getReg() != Reg ||
       MI.getOperand(1).getReg() != Reg ||
       getInstrPredicate(MI, MIPredReg) != Pred ||
@@ -1215,8 +1217,8 @@ static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
 
 /// Searches for an increment or decrement of \p Reg before \p MBBI.
 static MachineBasicBlock::iterator
-findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg,
-                 ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
+findIncDecBefore(MachineBasicBlock::iterator MBBI, Register Reg,
+                 ARMCC::CondCodes Pred, Register PredReg, int &Offset) {
   Offset = 0;
   MachineBasicBlock &MBB = *MBBI->getParent();
   MachineBasicBlock::iterator BeginMBBI = MBB.begin();
@@ -1235,8 +1237,8 @@ findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg,
 
 /// Searches for a increment or decrement of \p Reg after \p MBBI.
 static MachineBasicBlock::iterator
-findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg,
-                ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
+findIncDecAfter(MachineBasicBlock::iterator MBBI, Register Reg,
+                ARMCC::CondCodes Pred, Register PredReg, int &Offset) {
   Offset = 0;
   MachineBasicBlock &MBB = *MBBI->getParent();
   MachineBasicBlock::iterator EndMBBI = MBB.end();
@@ -1270,7 +1272,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
   const MachineOperand &BaseOP = MI->getOperand(0);
   Register Base = BaseOP.getReg();
   bool BaseKill = BaseOP.isKill();
-  unsigned PredReg = 0;
+  Register PredReg;
   ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
   unsigned Opcode = MI->getOpcode();
   DebugLoc DL = MI->getDebugLoc();
@@ -1383,6 +1385,38 @@ static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
   case ARM::t2STRi8:
   case ARM::t2STRi12:
     return ARM::t2STR_POST;
+
+  case ARM::MVE_VLDRBS16:
+    return ARM::MVE_VLDRBS16_post;
+  case ARM::MVE_VLDRBS32:
+    return ARM::MVE_VLDRBS32_post;
+  case ARM::MVE_VLDRBU16:
+    return ARM::MVE_VLDRBU16_post;
+  case ARM::MVE_VLDRBU32:
+    return ARM::MVE_VLDRBU32_post;
+  case ARM::MVE_VLDRHS32:
+    return ARM::MVE_VLDRHS32_post;
+  case ARM::MVE_VLDRHU32:
+    return ARM::MVE_VLDRHU32_post;
+  case ARM::MVE_VLDRBU8:
+    return ARM::MVE_VLDRBU8_post;
+  case ARM::MVE_VLDRHU16:
+    return ARM::MVE_VLDRHU16_post;
+  case ARM::MVE_VLDRWU32:
+    return ARM::MVE_VLDRWU32_post;
+  case ARM::MVE_VSTRB16:
+    return ARM::MVE_VSTRB16_post;
+  case ARM::MVE_VSTRB32:
+    return ARM::MVE_VSTRB32_post;
+  case ARM::MVE_VSTRH32:
+    return ARM::MVE_VSTRH32_post;
+  case ARM::MVE_VSTRBU8:
+    return ARM::MVE_VSTRBU8_post;
+  case ARM::MVE_VSTRHU16:
+    return ARM::MVE_VSTRHU16_post;
+  case ARM::MVE_VSTRWU32:
+    return ARM::MVE_VSTRWU32_post;
+
   default: llvm_unreachable("Unhandled opcode!");
   }
 }
@@ -1412,7 +1446,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
   if (MI->getOperand(0).getReg() == Base)
     return false;
 
-  unsigned PredReg = 0;
+  Register PredReg;
   ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
   int Bytes = getLSMultipleTransferSize(MI);
   MachineBasicBlock &MBB = *MI->getParent();
@@ -1525,7 +1559,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
   if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base)
     return false;
 
-  unsigned PredReg;
+  Register PredReg;
   ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
   MachineBasicBlock::iterator MBBI(MI);
   MachineBasicBlock &MBB = *MI.getParent();
@@ -1602,13 +1636,13 @@ static bool isMemoryOp(const MachineInstr &MI) {
 
   // Don't touch volatile memory accesses - we may be changing their order.
   // TODO: We could allow unordered and monotonic atomics here, but we need to
-  // make sure the resulting ldm/stm is correctly marked as atomic. 
+  // make sure the resulting ldm/stm is correctly marked as atomic.
   if (MMO.isVolatile() || MMO.isAtomic())
     return false;
 
   // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is
   // not.
-  if (MMO.getAlignment() < 4)
+  if (MMO.getAlign() < Align(4))
     return false;
 
   // str <undef> could probably be eliminated entirely, but for now we just want
@@ -1692,7 +1726,7 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
   assert((isT2 || MI->getOperand(3).getReg() == ARM::NoRegister) &&
          "register offset not handled below");
   int OffImm = getMemoryOpOffset(*MI);
-  unsigned PredReg = 0;
+  Register PredReg;
   ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
 
   if (OddRegNum > EvenRegNum && OffImm == 0) {
@@ -1792,7 +1826,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
       const MachineOperand &MO = MBBI->getOperand(0);
       Register Reg = MO.getReg();
       Register Base = getLoadStoreBaseOp(*MBBI).getReg();
-      unsigned PredReg = 0;
+      Register PredReg;
       ARMCC::CondCodes Pred = getInstrPredicate(*MBBI, PredReg);
       int Offset = getMemoryOpOffset(*MBBI);
       if (CurrBase == 0) {
@@ -2046,6 +2080,7 @@ namespace {
     const TargetRegisterInfo *TRI;
     const ARMSubtarget *STI;
     MachineRegisterInfo *MRI;
+    MachineDominatorTree *DT;
     MachineFunction *MF;
 
     ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}
@@ -2058,29 +2093,34 @@ namespace {
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<AAResultsWrapperPass>();
+      AU.addRequired<MachineDominatorTree>();
+      AU.addPreserved<MachineDominatorTree>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
 
   private:
     bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
-                          unsigned &NewOpc, unsigned &EvenReg,
-                          unsigned &OddReg, unsigned &BaseReg,
-                          int &Offset,
-                          unsigned &PredReg, ARMCC::CondCodes &Pred,
-                          bool &isT2);
+                          unsigned &NewOpc, Register &EvenReg, Register &OddReg,
+                          Register &BaseReg, int &Offset, Register &PredReg,
+                          ARMCC::CondCodes &Pred, bool &isT2);
     bool RescheduleOps(MachineBasicBlock *MBB,
                        SmallVectorImpl<MachineInstr *> &Ops,
                        unsigned Base, bool isLd,
                        DenseMap<MachineInstr*, unsigned> &MI2LocMap);
     bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
+    bool DistributeIncrements();
+    bool DistributeIncrements(Register Base);
   };
 
 } // end anonymous namespace
 
 char ARMPreAllocLoadStoreOpt::ID = 0;
 
-INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
-                ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
+INITIALIZE_PASS_BEGIN(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
+                      ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
+                    ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
 
 // Limit the number of instructions to be rescheduled.
 // FIXME: tune this limit, and/or come up with some better heuristics.
@@ -2096,10 +2136,11 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   TII = STI->getInstrInfo();
   TRI = STI->getRegisterInfo();
   MRI = &Fn.getRegInfo();
+  DT = &getAnalysis<MachineDominatorTree>();
   MF = &Fn;
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
 
-  bool Modified = false;
+  bool Modified = DistributeIncrements();
   for (MachineBasicBlock &MFI : Fn)
     Modified |= RescheduleLoadStoreInstrs(&MFI);
 
@@ -2143,15 +2184,10 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
   return AddedRegPressure.size() <= MemRegs.size() * 2;
 }
 
-bool
-ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
-                                          DebugLoc &dl, unsigned &NewOpc,
-                                          unsigned &FirstReg,
-                                          unsigned &SecondReg,
-                                          unsigned &BaseReg, int &Offset,
-                                          unsigned &PredReg,
-                                          ARMCC::CondCodes &Pred,
-                                          bool &isT2) {
+bool ARMPreAllocLoadStoreOpt::CanFormLdStDWord(
+    MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl, unsigned &NewOpc,
+    Register &FirstReg, Register &SecondReg, Register &BaseReg, int &Offset,
+    Register &PredReg, ARMCC::CondCodes &Pred, bool &isT2) {
   // Make sure we're allowed to generate LDRD/STRD.
   if (!STI->hasV5TEOps())
     return false;
@@ -2183,12 +2219,12 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
       (*Op0->memoperands_begin())->isAtomic())
     return false;
 
-  unsigned Align = (*Op0->memoperands_begin())->getAlignment();
+  Align Alignment = (*Op0->memoperands_begin())->getAlign();
   const Function &Func = MF->getFunction();
-  unsigned ReqAlign = STI->hasV6Ops()
-      ? TD->getABITypeAlignment(Type::getInt64Ty(Func.getContext()))
-      : 8; // Pre-v6 need 8-byte align
-  if (Align < ReqAlign)
+  Align ReqAlign =
+      STI->hasV6Ops() ? TD->getABITypeAlign(Type::getInt64Ty(Func.getContext()))
+                      : Align(8); // Pre-v6 need 8-byte align
+  if (Alignment < ReqAlign)
     return false;
 
   // Then make sure the immediate offset fits.
@@ -2313,8 +2349,8 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
         // to try to allocate a pair of registers that can form register pairs.
         MachineInstr *Op0 = Ops.back();
         MachineInstr *Op1 = Ops[Ops.size()-2];
-        unsigned FirstReg = 0, SecondReg = 0;
-        unsigned BaseReg = 0, PredReg = 0;
+        Register FirstReg, SecondReg;
+        Register BaseReg, PredReg;
         ARMCC::CondCodes Pred = ARMCC::AL;
         bool isT2 = false;
         unsigned NewOpc = 0;
@@ -2416,7 +2452,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
       if (!isMemoryOp(MI))
         continue;
 
-      unsigned PredReg = 0;
+      Register PredReg;
      if (getInstrPredicate(MI, PredReg) != ARMCC::AL)
         continue;
 
@@ -2482,6 +2518,199 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
 
   return RetVal;
 }
 
+// Get the Base register operand index from the memory access MachineInstr if
+// we should attempt to distribute postinc on it. Return -1 if it is not a
+// valid instruction type. If an index is returned, the instruction is assumed
+// to use an r+i addressing mode, and getBaseOperandIndex() + 1 is the Offset
+// index.
+static int getBaseOperandIndex(MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case ARM::MVE_VLDRBS16:
+  case ARM::MVE_VLDRBS32:
+  case ARM::MVE_VLDRBU16:
+  case ARM::MVE_VLDRBU32:
+  case ARM::MVE_VLDRHS32:
+  case ARM::MVE_VLDRHU32:
+  case ARM::MVE_VLDRBU8:
+  case ARM::MVE_VLDRHU16:
+  case ARM::MVE_VLDRWU32:
+  case ARM::MVE_VSTRB16:
+  case ARM::MVE_VSTRB32:
+  case ARM::MVE_VSTRH32:
+  case ARM::MVE_VSTRBU8:
+  case ARM::MVE_VSTRHU16:
+  case ARM::MVE_VSTRWU32:
+    return 1;
+  }
+  return -1;
+}
+
+static MachineInstr *createPostIncLoadStore(MachineInstr *MI, int Offset,
+                                            Register NewReg,
+                                            const TargetInstrInfo *TII,
+                                            const TargetRegisterInfo *TRI) {
+  MachineFunction *MF = MI->getMF();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  unsigned NewOpcode = getPostIndexedLoadStoreOpcode(
+      MI->getOpcode(), Offset > 0 ? ARM_AM::add : ARM_AM::sub);
+
+  const MCInstrDesc &MCID = TII->get(NewOpcode);
+  // Constrain the def register class
+  const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
+  MRI.constrainRegClass(NewReg, TRC);
+  // And do the same for the base operand
+  TRC = TII->getRegClass(MCID, 2, TRI, *MF);
+  MRI.constrainRegClass(MI->getOperand(1).getReg(), TRC);
+
+  return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID)
+      .addReg(NewReg, RegState::Define)
+      .add(MI->getOperand(0))
+      .add(MI->getOperand(1))
+      .addImm(Offset)
+      .add(MI->getOperand(3))
+      .add(MI->getOperand(4))
+      .cloneMemRefs(*MI);
+}
+
+// Given a Base Register, optimise the load/store uses to attempt to create
+// more post-inc accesses. We do this by taking zero offset loads/stores with
+// an add, and converting them to a postinc load/store of the same type. Any
+// subsequent accesses will be adjusted to use and account for the post-inc
+// value.
+// For example:
+// LDR #0            LDR_POSTINC #16
+// LDR #4            LDR #-12
+// LDR #8            LDR #-8
+// LDR #12           LDR #-4
+// ADD #16
+bool ARMPreAllocLoadStoreOpt::DistributeIncrements(Register Base) {
+  // We are looking for:
+  // One zero offset load/store that can become postinc
+  MachineInstr *BaseAccess = nullptr;
+  // An increment that can be folded in
+  MachineInstr *Increment = nullptr;
+  // Other accesses after BaseAccess that will need to be updated to use the
+  // postinc value
+  SmallPtrSet<MachineInstr *, 8> OtherAccesses;
+  for (auto &Use : MRI->use_nodbg_instructions(Base)) {
+    if (!Increment && getAddSubImmediate(Use) != 0) {
+      Increment = &Use;
+      continue;
+    }
+
+    int BaseOp = getBaseOperandIndex(Use);
+    if (BaseOp == -1)
+      return false;
+
+    if (!Use.getOperand(BaseOp).isReg() ||
+        Use.getOperand(BaseOp).getReg() != Base)
+      return false;
+    if (Use.getOperand(BaseOp + 1).getImm() == 0)
+      BaseAccess = &Use;
+    else
+      OtherAccesses.insert(&Use);
+  }
+
+  if (!BaseAccess || !Increment ||
+      BaseAccess->getParent() != Increment->getParent())
+    return false;
+  Register PredReg;
+  if (Increment->definesRegister(ARM::CPSR) ||
+      getInstrPredicate(*Increment, PredReg) != ARMCC::AL)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "\nAttempting to distribute increments on VirtualReg "
+                    << Base.virtRegIndex() << "\n");
+
+  // Make sure that Increment has no uses before BaseAccess.
+  for (MachineInstr &Use :
+       MRI->use_nodbg_instructions(Increment->getOperand(0).getReg())) {
+    if (!DT->dominates(BaseAccess, &Use) || &Use == BaseAccess) {
+      LLVM_DEBUG(dbgs() << "  BaseAccess doesn't dominate use of increment\n");
+      return false;
+    }
+  }
+
+  // Make sure that Increment can be folded into Base
+  int IncrementOffset = getAddSubImmediate(*Increment);
+  unsigned NewPostIncOpcode = getPostIndexedLoadStoreOpcode(
+      BaseAccess->getOpcode(), IncrementOffset > 0 ? ARM_AM::add : ARM_AM::sub);
+  if (!isLegalAddressImm(NewPostIncOpcode, IncrementOffset, TII)) {
+    LLVM_DEBUG(dbgs() << "  Illegal addressing mode immediate on postinc\n");
+    return false;
+  }
+
+  // And make sure that the negative value of the increment can be added to all
+  // other offsets after the BaseAccess. We rely on either
+  // dominates(BaseAccess, OtherAccess) or dominates(OtherAccess, BaseAccess)
+  // to keep things simple.
+  SmallPtrSet<MachineInstr *, 4> SuccessorAccesses;
+  for (auto *Use : OtherAccesses) {
+    if (DT->dominates(BaseAccess, Use)) {
+      SuccessorAccesses.insert(Use);
+      unsigned BaseOp = getBaseOperandIndex(*Use);
+      if (!isLegalAddressImm(
+              Use->getOpcode(),
+              Use->getOperand(BaseOp + 1).getImm() - IncrementOffset, TII)) {
+        LLVM_DEBUG(dbgs() << "  Illegal addressing mode immediate on use\n");
+        return false;
+      }
+    } else if (!DT->dominates(Use, BaseAccess)) {
+      LLVM_DEBUG(
+          dbgs() << "  Unknown dominance relation between Base and Use\n");
+      return false;
+    }
+  }
+
+  // Replace BaseAccess with a post inc
+  LLVM_DEBUG(dbgs() << "Changing: "; BaseAccess->dump());
+  LLVM_DEBUG(dbgs() << "  And   : "; Increment->dump());
+  Register NewBaseReg = Increment->getOperand(0).getReg();
+  MachineInstr *BaseAccessPost =
+      createPostIncLoadStore(BaseAccess, IncrementOffset, NewBaseReg, TII, TRI);
+  BaseAccess->eraseFromParent();
+  Increment->eraseFromParent();
+  (void)BaseAccessPost;
+  LLVM_DEBUG(dbgs() << "  To    : "; BaseAccessPost->dump());
+
+  for (auto *Use : SuccessorAccesses) {
+    LLVM_DEBUG(dbgs() << "Changing: "; Use->dump());
+    unsigned BaseOp = getBaseOperandIndex(*Use);
+    Use->getOperand(BaseOp).setReg(NewBaseReg);
+    int OldOffset = Use->getOperand(BaseOp + 1).getImm();
+    Use->getOperand(BaseOp + 1).setImm(OldOffset - IncrementOffset);
+    LLVM_DEBUG(dbgs() << "  To    : "; Use->dump());
+  }
+
+  // Remove the kill flag from all uses of NewBaseReg, in case any old uses
+  // remain.
+  for (MachineOperand &Op : MRI->use_nodbg_operands(NewBaseReg))
+    Op.setIsKill(false);
+  return true;
+}
+
+bool ARMPreAllocLoadStoreOpt::DistributeIncrements() {
+  bool Changed = false;
+  SmallSetVector<Register, 4> Visited;
+  for (auto &MBB : *MF) {
+    for (auto &MI : MBB) {
+      int BaseOp = getBaseOperandIndex(MI);
+      if (BaseOp == -1 || !MI.getOperand(BaseOp).isReg())
+        continue;
+
+      Register Base = MI.getOperand(BaseOp).getReg();
+      if (!Base.isVirtual() || Visited.count(Base))
+        continue;
+
+      Visited.insert(Base);
+    }
+  }
+
+  for (auto Base : Visited)
+    Changed |= DistributeIncrements(Base);
+
+  return Changed;
+}
+
 /// Returns an instance of the load / store optimization pass.
 FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
   if (PreAlloc)
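
A minimal standalone C++ sketch (not part of the patch) of the offset rebasing that DistributeIncrements performs; the mnemonics and numbers are taken from the worked example in the comment above. The zero-offset access absorbs the ADD as a post-increment, and every later access is rebased by subtracting the increment.

#include <cstdio>
#include <vector>

int main() {
  const int Increment = 16;                       // the folded ADD #16
  const std::vector<int> Offsets = {0, 4, 8, 12}; // original immediate offsets

  for (int Off : Offsets) {
    if (Off == 0)
      std::printf("LDR_POSTINC #%d\n", Increment); // base access becomes post-inc
    else
      std::printf("LDR #%d\n", Off - Increment);   // e.g. 4 - 16 == -12
  }
  return 0;
}

Running this prints the right-hand column of the example: LDR_POSTINC #16, then LDR #-12, LDR #-8, LDR #-4. In the pass itself the same subtraction appears as setImm(OldOffset - IncrementOffset), guarded by the isLegalAddressImm checks so that every rebased immediate still fits the instruction's addressing mode.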