diff options
Diffstat (limited to 'lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
| -rw-r--r-- | lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 95 | 
1 file changed, 59 insertions, 36 deletions
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 84cd47a101a8..4b537540046f 100644 --- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -45,6 +45,7 @@  #include "AMDGPUSubtarget.h"  #include "SIInstrInfo.h"  #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h"  #include "Utils/AMDGPUBaseInfo.h"  #include "llvm/ADT/ArrayRef.h"  #include "llvm/ADT/SmallVector.h" @@ -102,7 +103,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {     };  private: -  const SISubtarget *STM = nullptr; +  const GCNSubtarget *STM = nullptr;    const SIInstrInfo *TII = nullptr;    const SIRegisterInfo *TRI = nullptr;    MachineRegisterInfo *MRI = nullptr; @@ -137,7 +138,7 @@ public:    bool runOnMachineFunction(MachineFunction &MF) override; -  StringRef getPassName() const override { return "SI Load / Store Optimizer"; } +  StringRef getPassName() const override { return "SI Load Store Optimizer"; }    void getAnalysisUsage(AnalysisUsage &AU) const override {      AU.setPreservesCFG(); @@ -150,10 +151,10 @@ public:  } // end anonymous namespace.  INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, -                      "SI Load / Store Optimizer", false, false) +                      "SI Load Store Optimizer", false, false)  INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)  INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, -                    "SI Load / Store Optimizer", false, false) +                    "SI Load Store Optimizer", false, false)  char SILoadStoreOptimizer::ID = 0; @@ -173,10 +174,18 @@ static void moveInstsAfter(MachineBasicBlock::iterator I,    }  } -static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) { -  // XXX: Should this be looking for implicit defs? 
-  for (const MachineOperand &Def : MI.defs()) -    Defs.insert(Def.getReg()); +static void addDefsUsesToList(const MachineInstr &MI, +                              DenseSet<unsigned> &RegDefs, +                              DenseSet<unsigned> &PhysRegUses) { +  for (const MachineOperand &Op : MI.operands()) { +    if (Op.isReg()) { +      if (Op.isDef()) +        RegDefs.insert(Op.getReg()); +      else if (Op.readsReg() && +               TargetRegisterInfo::isPhysicalRegister(Op.getReg())) +        PhysRegUses.insert(Op.getReg()); +    } +  }  }  static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, @@ -194,16 +203,24 @@ static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,  // already in the list. Returns true in that case.  static bool  addToListsIfDependent(MachineInstr &MI, -                      DenseSet<unsigned> &Defs, +                      DenseSet<unsigned> &RegDefs, +                      DenseSet<unsigned> &PhysRegUses,                        SmallVectorImpl<MachineInstr*> &Insts) {    for (MachineOperand &Use : MI.operands()) {      // If one of the defs is read, then there is a use of Def between I and the      // instruction that I will potentially be merged with. We will need to move      // this instruction after the merged instructions. - -    if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) { +    // +    // Similarly, if there is a def which is read by an instruction that is to +    // be moved for merging, then we need to move the def-instruction as well. +    // This can only happen for physical registers such as M0; virtual +    // registers are in SSA form. 
+    if (Use.isReg() && +        ((Use.readsReg() && RegDefs.count(Use.getReg())) || +         (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) && +          PhysRegUses.count(Use.getReg())))) {        Insts.push_back(&MI); -      addDefsToList(MI, Defs); +      addDefsUsesToList(MI, RegDefs, PhysRegUses);        return true;      }    } @@ -332,8 +349,9 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {    ++MBBI; -  DenseSet<unsigned> DefsToMove; -  addDefsToList(*CI.I, DefsToMove); +  DenseSet<unsigned> RegDefsToMove; +  DenseSet<unsigned> PhysRegUsesToMove; +  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);    for ( ; MBBI != E; ++MBBI) {      if (MBBI->getOpcode() != CI.I->getOpcode()) { @@ -356,14 +374,15 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {          // #2.  Add this instruction to the move list and then we will check          // if condition #2 holds once we have selected the matching instruction.          CI.InstsToMove.push_back(&*MBBI); -        addDefsToList(*MBBI, DefsToMove); +        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);          continue;        }        // When we match I with another DS instruction we will be moving I down        // to the location of the matched instruction any uses of I will need to        // be moved down as well. -      addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove); +      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, +                            CI.InstsToMove);        continue;      } @@ -377,7 +396,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {      //   DS_WRITE_B32 addr, f(w), idx1      // where the DS_READ_B32 ends up in InstsToMove and therefore prevents      // merging of the two writes. 
-    if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove)) +    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, +                              CI.InstsToMove))        continue;      bool Match = true; @@ -436,7 +456,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {      // down past this instruction.      // check if we can move I across MBBI and if we can move all I's users      if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || -      !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA)) +        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))        break;    }    return false; @@ -496,13 +516,15 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(    unsigned BaseReg = AddrReg->getReg();    unsigned BaseRegFlags = 0;    if (CI.BaseOff) { +    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); +    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) +      .addImm(CI.BaseOff); +      BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);      BaseRegFlags = RegState::Kill; -    unsigned AddOpc = STM->hasAddNoCarry() ? -      AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32; -    BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg) -      .addImm(CI.BaseOff) +    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) +      .addReg(ImmReg)        .addReg(AddrReg->getReg());    } @@ -532,7 +554,7 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(    CI.I->eraseFromParent();    CI.Paired->eraseFromParent(); -  DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); +  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');    return Next;  } @@ -556,7 +578,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(    // Be sure to use .addOperand(), and not .addReg() with these. We want to be    // sure we preserve the subregister index and any register flags set on them. 
-  const MachineOperand *Addr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); +  const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);    const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);    const MachineOperand *Data1      = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0); @@ -579,17 +601,19 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(    const MCInstrDesc &Write2Desc = TII->get(Opc);    DebugLoc DL = CI.I->getDebugLoc(); -  unsigned BaseReg = Addr->getReg(); +  unsigned BaseReg = AddrReg->getReg();    unsigned BaseRegFlags = 0;    if (CI.BaseOff) { +    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); +    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) +      .addImm(CI.BaseOff); +      BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);      BaseRegFlags = RegState::Kill; -    unsigned AddOpc = STM->hasAddNoCarry() ? -      AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32; -    BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg) -      .addImm(CI.BaseOff) -      .addReg(Addr->getReg()); +    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) +      .addReg(ImmReg) +      .addReg(AddrReg->getReg());    }    MachineInstrBuilder Write2 = @@ -608,7 +632,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(    CI.I->eraseFromParent();    CI.Paired->eraseFromParent(); -  DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); +  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');    return Next;  } @@ -849,9 +873,8 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {        continue;      } -    if (STM->hasSBufferLoadStoreAtomicDwordxN() && -        (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM || -         Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM)) { +    if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM || +        Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {        // 
EltSize is in units of the offset encoding.        CI.InstClass = S_BUFFER_LOAD_IMM;        CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4); @@ -916,7 +939,7 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {    if (skipFunction(MF.getFunction()))      return false; -  STM = &MF.getSubtarget<SISubtarget>(); +  STM = &MF.getSubtarget<GCNSubtarget>();    if (!STM->loadStoreOptEnabled())      return false; @@ -928,7 +951,7 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {    assert(MRI->isSSA() && "Must be run on SSA"); -  DEBUG(dbgs() << "Running SILoadStoreOptimizer\n"); +  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");    bool Modified = false;  | 
