Diffstat (limited to 'lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
-rw-r--r-- | lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 95 |
1 file changed, 59 insertions, 36 deletions
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 84cd47a101a8..4b537540046f 100644
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -45,6 +45,7 @@
 #include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
 #include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
@@ -102,7 +103,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
   };
 
 private:
-  const SISubtarget *STM = nullptr;
+  const GCNSubtarget *STM = nullptr;
   const SIInstrInfo *TII = nullptr;
   const SIRegisterInfo *TRI = nullptr;
   MachineRegisterInfo *MRI = nullptr;
@@ -137,7 +138,7 @@ public:
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
-  StringRef getPassName() const override { return "SI Load / Store Optimizer"; }
+  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
@@ -150,10 +151,10 @@ public:
 } // end anonymous namespace.
 
 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
-                      "SI Load / Store Optimizer", false, false)
+                      "SI Load Store Optimizer", false, false)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
-                    "SI Load / Store Optimizer", false, false)
+                    "SI Load Store Optimizer", false, false)
 
 char SILoadStoreOptimizer::ID = 0;
 
@@ -173,10 +174,18 @@ static void moveInstsAfter(MachineBasicBlock::iterator I,
   }
 }
 
-static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) {
-  // XXX: Should this be looking for implicit defs?
-  for (const MachineOperand &Def : MI.defs())
-    Defs.insert(Def.getReg());
+static void addDefsUsesToList(const MachineInstr &MI,
+                              DenseSet<unsigned> &RegDefs,
+                              DenseSet<unsigned> &PhysRegUses) {
+  for (const MachineOperand &Op : MI.operands()) {
+    if (Op.isReg()) {
+      if (Op.isDef())
+        RegDefs.insert(Op.getReg());
+      else if (Op.readsReg() &&
+               TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
+        PhysRegUses.insert(Op.getReg());
+    }
+  }
 }
 
 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
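Note: this hunk and the next one replace addDefsToList with addDefsUsesToList, so the pass now tracks which physical registers (e.g. M0) the to-be-moved instructions read, in addition to the registers they define. The following is a minimal standalone model of the new dependence test; Operand, Inst, and the two function names are made-up stand-ins for LLVM's MachineOperand/MachineInstr and DenseSet, so treat it as an illustrative sketch, not code from the commit.

#include <unordered_set>
#include <vector>

// Made-up stand-ins for MachineOperand / MachineInstr (illustrative only).
struct Operand {
  unsigned Reg;
  bool IsDef;   // operand writes Reg
  bool IsPhys;  // physical register such as M0; virtual regs are in SSA form
};
using Inst = std::vector<Operand>;

// Mirrors addDefsUsesToList: record every def, plus every *physical*
// register the instruction reads.
static void addDefsUses(const Inst &MI, std::unordered_set<unsigned> &RegDefs,
                        std::unordered_set<unsigned> &PhysRegUses) {
  for (const Operand &Op : MI) {
    if (Op.IsDef)
      RegDefs.insert(Op.Reg);
    else if (Op.IsPhys)
      PhysRegUses.insert(Op.Reg);
  }
}

// Mirrors addToListsIfDependent: MI must move with the merged pair if it
// reads a tracked def, or (the new case) if it redefines a physical
// register that an already-collected instruction reads.
static bool addIfDependent(const Inst &MI,
                           std::unordered_set<unsigned> &RegDefs,
                           std::unordered_set<unsigned> &PhysRegUses,
                           std::vector<const Inst *> &Insts) {
  for (const Operand &Op : MI) {
    bool ReadsTrackedDef = !Op.IsDef && RegDefs.count(Op.Reg) != 0;
    bool ClobbersUsedPhysReg =
        Op.IsDef && Op.IsPhys && PhysRegUses.count(Op.Reg) != 0;
    if (ReadsTrackedDef || ClobbersUsedPhysReg) {
      Insts.push_back(&MI);
      addDefsUses(MI, RegDefs, PhysRegUses); // track its defs/uses in turn
      return true;
    }
  }
  return false;
}

int main() {
  // A DS instruction being merged reads M0; a later write to M0 must then
  // be collected too, which the old defs-only tracking missed.
  const unsigned M0 = 1;
  std::unordered_set<unsigned> RegDefs, PhysRegUses;
  std::vector<const Inst *> Insts;
  Inst DSRead = {{M0, /*IsDef=*/false, /*IsPhys=*/true}};
  addDefsUses(DSRead, RegDefs, PhysRegUses);
  Inst M0Write = {{M0, /*IsDef=*/true, /*IsPhys=*/true}};
  return addIfDependent(M0Write, RegDefs, PhysRegUses, Insts) ? 0 : 1;
}

As the commit's new comment says, this situation can only arise for physical registers such as M0, because virtual registers are in SSA form and cannot be redefined.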
@@ -194,16 +203,24 @@ static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
 // already in the list. Returns true in that case.
 static bool
 addToListsIfDependent(MachineInstr &MI,
-                      DenseSet<unsigned> &Defs,
+                      DenseSet<unsigned> &RegDefs,
+                      DenseSet<unsigned> &PhysRegUses,
                       SmallVectorImpl<MachineInstr*> &Insts) {
   for (MachineOperand &Use : MI.operands()) {
     // If one of the defs is read, then there is a use of Def between I and the
     // instruction that I will potentially be merged with. We will need to move
     // this instruction after the merged instructions.
-
-    if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) {
+    //
+    // Similarly, if there is a def which is read by an instruction that is to
+    // be moved for merging, then we need to move the def-instruction as well.
+    // This can only happen for physical registers such as M0; virtual
+    // registers are in SSA form.
+    if (Use.isReg() &&
+        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
+         (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
+          PhysRegUses.count(Use.getReg())))) {
       Insts.push_back(&MI);
-      addDefsToList(MI, Defs);
+      addDefsUsesToList(MI, RegDefs, PhysRegUses);
       return true;
     }
   }
@@ -332,8 +349,9 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
 
   ++MBBI;
 
-  DenseSet<unsigned> DefsToMove;
-  addDefsToList(*CI.I, DefsToMove);
+  DenseSet<unsigned> RegDefsToMove;
+  DenseSet<unsigned> PhysRegUsesToMove;
+  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
 
   for ( ; MBBI != E; ++MBBI) {
     if (MBBI->getOpcode() != CI.I->getOpcode()) {
@@ -356,14 +374,15 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
       // #2. Add this instruction to the move list and then we will check
       // if condition #2 holds once we have selected the matching instruction.
       CI.InstsToMove.push_back(&*MBBI);
-      addDefsToList(*MBBI, DefsToMove);
+      addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
       continue;
     }
 
     // When we match I with another DS instruction we will be moving I down
     // to the location of the matched instruction any uses of I will need to
     // be moved down as well.
-    addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove);
+    addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
+                          CI.InstsToMove);
     continue;
   }
 
@@ -377,7 +396,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
     //   DS_WRITE_B32 addr, f(w), idx1
     // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
     // merging of the two writes.
-    if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove))
+    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
+                              CI.InstsToMove))
       continue;
 
     bool Match = true;
@@ -436,7 +456,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
     // down past this instruction.
     // check if we can move I across MBBI and if we can move all I's users
     if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
-      !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
+        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
       break;
   }
   return false;
@@ -496,13 +516,15 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
   unsigned BaseReg = AddrReg->getReg();
   unsigned BaseRegFlags = 0;
   if (CI.BaseOff) {
+    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
+      .addImm(CI.BaseOff);
+
     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     BaseRegFlags = RegState::Kill;
 
-    unsigned AddOpc = STM->hasAddNoCarry() ?
-      AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32;
-    BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg)
-      .addImm(CI.BaseOff)
+    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
+      .addReg(ImmReg)
       .addReg(AddrReg->getReg());
   }
 
@@ -532,7 +554,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();
 
-  DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
+  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
   return Next;
 }
 
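Note: mergeRead2Pair (above) and mergeWrite2Pair (below) receive the same change to the CI.BaseOff path. An annotated copy of the new sequence follows; the comments are explanatory additions for this note, not part of the commit, and the suggested reason for routing the immediate through an SGPR (the add built by SIInstrInfo::getAddNoCarry may be an e64 form, which cannot encode a literal operand) is an inference from the code, not something the diff states.

// Annotated copy of the new CI.BaseOff lowering shared by both merge paths.
if (CI.BaseOff) {
  // Materialize the common base offset into an SGPR with S_MOV_B32 instead
  // of folding it into the add as a literal.
  unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
    .addImm(CI.BaseOff);

  // The rebased address lives in a fresh VGPR; RegState::Kill is applied
  // where the merged ds instruction consumes BaseReg.
  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  BaseRegFlags = RegState::Kill;

  // Let SIInstrInfo pick the right carry-less VALU add for the subtarget,
  // instead of open-coding the V_ADD_U32_e32 / V_ADD_I32_e32 choice here.
  TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
    .addReg(ImmReg)
    .addReg(AddrReg->getReg());
}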
@@ -556,7 +578,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
 
   // Be sure to use .addOperand(), and not .addReg() with these. We want to be
   // sure we preserve the subregister index and any register flags set on them.
-  const MachineOperand *Addr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
+  const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
   const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
   const MachineOperand *Data1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
 
@@ -579,17 +601,19 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
   const MCInstrDesc &Write2Desc = TII->get(Opc);
   DebugLoc DL = CI.I->getDebugLoc();
 
-  unsigned BaseReg = Addr->getReg();
+  unsigned BaseReg = AddrReg->getReg();
   unsigned BaseRegFlags = 0;
   if (CI.BaseOff) {
+    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
+      .addImm(CI.BaseOff);
+
     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     BaseRegFlags = RegState::Kill;
 
-    unsigned AddOpc = STM->hasAddNoCarry() ?
-      AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32;
-    BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg)
-      .addImm(CI.BaseOff)
-      .addReg(Addr->getReg());
+    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
+      .addReg(ImmReg)
+      .addReg(AddrReg->getReg());
   }
 
   MachineInstrBuilder Write2 =
@@ -608,7 +632,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();
 
-  DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
+  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
   return Next;
 }
 
@@ -849,9 +873,8 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
       continue;
     }
 
-    if (STM->hasSBufferLoadStoreAtomicDwordxN() &&
-        (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
-         Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM)) {
+    if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
+        Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {
       // EltSize is in units of the offset encoding.
      CI.InstClass = S_BUFFER_LOAD_IMM;
       CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
@@ -916,7 +939,7 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
 
-  STM = &MF.getSubtarget<SISubtarget>();
+  STM = &MF.getSubtarget<GCNSubtarget>();
   if (!STM->loadStoreOptEnabled())
     return false;
 
@@ -928,7 +951,7 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
 
   assert(MRI->isSSA() && "Must be run on SSA");
 
-  DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
+  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
 
   bool Modified = false;
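Note: the DEBUG to LLVM_DEBUG changes follow the LLVM-wide rename of the debug macro in llvm/Support/Debug.h; behavior is unchanged. Below is a minimal sketch of the pattern as a hypothetical standalone file (in the real file, DEBUG_TYPE is defined near the top as "si-load-store-opt").

#define DEBUG_TYPE "si-load-store-opt"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static void reportRun() {
  // Compiled out entirely in release (NDEBUG) builds; in asserts builds it
  // prints only when -debug or -debug-only=si-load-store-opt is passed.
  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
}

For example, running llc with -debug-only=si-load-store-opt on an asserts build should surface the "Inserted read2:" and "Inserted write2 inst:" lines from the hunks above.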