Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
| Mode | File | Lines changed |
| --- | --- | --- |
| -rw-r--r-- | contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 1062 |
1 file changed, 816 insertions, 246 deletions
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 4b537540046f..be291b127301 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -20,6 +20,26 @@  // ==>  //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4  // +// This pass also tries to promote constant offset to the immediate by +// adjusting the base. It tries to use a base from the nearby instructions that +// allows it to have a 13bit constant offset and then promotes the 13bit offset +// to the immediate. +// E.g. +//  s_movk_i32 s0, 0x1800 +//  v_add_co_u32_e32 v0, vcc, s0, v2 +//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc +// +//  s_movk_i32 s0, 0x1000 +//  v_add_co_u32_e32 v5, vcc, s0, v2 +//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +//  global_load_dwordx2 v[5:6], v[5:6], off +//  global_load_dwordx2 v[0:1], v[0:1], off +// => +//  s_movk_i32 s0, 0x1000 +//  v_add_co_u32_e32 v5, vcc, s0, v2 +//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +//  global_load_dwordx2 v[5:6], v[5:6], off +//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048  //  // Future improvements:  // @@ -43,9 +63,9 @@  #include "AMDGPU.h"  #include "AMDGPUSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h"  #include "SIInstrInfo.h"  #include "SIRegisterInfo.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h"  #include "Utils/AMDGPUBaseInfo.h"  #include "llvm/ADT/ArrayRef.h"  #include "llvm/ADT/SmallVector.h" @@ -74,23 +94,38 @@ using namespace llvm;  #define DEBUG_TYPE "si-load-store-opt"  namespace { +enum InstClassEnum { +  UNKNOWN, +  DS_READ, +  DS_WRITE, +  S_BUFFER_LOAD_IMM, +  BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN, +  BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET, +  BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN, +  BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET, +  BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact, +  BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact, +  BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact, +  BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact, +}; -class SILoadStoreOptimizer : public MachineFunctionPass { -  enum InstClassEnum { -    DS_READ_WRITE, -    S_BUFFER_LOAD_IMM, -    BUFFER_LOAD_OFFEN, -    BUFFER_LOAD_OFFSET, -    BUFFER_STORE_OFFEN, -    BUFFER_STORE_OFFSET, -  }; +enum RegisterEnum { +  SBASE = 0x1, +  SRSRC = 0x2, +  SOFFSET = 0x4, +  VADDR = 0x8, +  ADDR = 0x10, +}; +class SILoadStoreOptimizer : public MachineFunctionPass {    struct CombineInfo {      MachineBasicBlock::iterator I;      MachineBasicBlock::iterator Paired;      unsigned EltSize;      unsigned Offset0;      unsigned Offset1; +    unsigned Width0; +    unsigned Width1;      unsigned BaseOff;      InstClassEnum InstClass;      bool GLC0; @@ -98,9 +133,23 @@ class SILoadStoreOptimizer : public MachineFunctionPass {      bool SLC0;      bool SLC1;      bool UseST64; -    bool IsX2; -    SmallVector<MachineInstr*, 8> InstsToMove; -   }; +    SmallVector<MachineInstr *, 8> InstsToMove; +  }; + +  struct BaseRegisters { +    unsigned LoReg = 0; +    unsigned HiReg = 0; + +    unsigned LoSubReg = 0; +    unsigned HiSubReg = 0; +  }; + +  struct MemAddress { +    BaseRegisters Base; +    int64_t Offset = 0; +  }; + +  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;  private:    const GCNSubtarget *STM = nullptr; @@ -108,9 +157,16 @@ private:    const SIRegisterInfo *TRI = nullptr;    
MachineRegisterInfo *MRI = nullptr;    AliasAnalysis *AA = nullptr; -  unsigned CreatedX2; +  bool OptimizeAgain;    static bool offsetsCanBeCombined(CombineInfo &CI); +  static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI); +  static unsigned getNewOpcode(const CombineInfo &CI); +  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI); +  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI); +  unsigned getOpcodeWidth(const MachineInstr &MI); +  InstClassEnum getInstClass(unsigned Opc); +  unsigned getRegs(unsigned Opc);    bool findMatchingInst(CombineInfo &CI); @@ -123,10 +179,21 @@ private:    MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);    MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);    MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI); -  unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2, -                                    bool &IsOffen) const;    MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI); +  void updateBaseAndOffset(MachineInstr &I, unsigned NewBase, +                           int32_t NewOffset); +  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr); +  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI); +  Optional<int32_t> extractConstOffset(const MachineOperand &Op); +  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr); +  /// Promotes constant offset to the immediate by adjusting the base. It +  /// tries to use a base from the nearby instructions that allows it to have +  /// a 13bit constant offset which gets promoted to the immediate. +  bool promoteConstantOffsetToImm(MachineInstr &CI, +                                  MemInfoMap &Visited, +                                  SmallPtrSet<MachineInstr *, 4> &Promoted); +  public:    static char ID; @@ -153,8 +220,8 @@ public:  INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,                        "SI Load Store Optimizer", false, false)  INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, -                    "SI Load Store Optimizer", false, false) +INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer", +                    false, false)  char SILoadStoreOptimizer::ID = 0; @@ -165,7 +232,7 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass() {  }  static void moveInstsAfter(MachineBasicBlock::iterator I, -                           ArrayRef<MachineInstr*> InstsToMove) { +                           ArrayRef<MachineInstr *> InstsToMove) {    MachineBasicBlock *MBB = I->getParent();    ++I;    for (MachineInstr *MI : InstsToMove) { @@ -191,21 +258,19 @@ static void addDefsUsesToList(const MachineInstr &MI,  static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,                                        MachineBasicBlock::iterator B,                                        const SIInstrInfo *TII, -                                      AliasAnalysis * AA) { +                                      AliasAnalysis *AA) {    // RAW or WAR - cannot reorder    // WAW - cannot reorder    // RAR - safe to reorder    return !(A->mayStore() || B->mayStore()) || -    TII->areMemAccessesTriviallyDisjoint(*A, *B, AA); +         TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);  }  // Add MI and its defs to the lists if MI reads one of the defs that are  // already in the list. Returns true in that case. 
-static bool -addToListsIfDependent(MachineInstr &MI, -                      DenseSet<unsigned> &RegDefs, -                      DenseSet<unsigned> &PhysRegUses, -                      SmallVectorImpl<MachineInstr*> &Insts) { +static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs, +                                  DenseSet<unsigned> &PhysRegUses, +                                  SmallVectorImpl<MachineInstr *> &Insts) {    for (MachineOperand &Use : MI.operands()) {      // If one of the defs is read, then there is a use of Def between I and the      // instruction that I will potentially be merged with. We will need to move @@ -228,18 +293,16 @@ addToListsIfDependent(MachineInstr &MI,    return false;  } -static bool -canMoveInstsAcrossMemOp(MachineInstr &MemOp, -                        ArrayRef<MachineInstr*> InstsToMove, -                        const SIInstrInfo *TII, -                        AliasAnalysis *AA) { +static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp, +                                    ArrayRef<MachineInstr *> InstsToMove, +                                    const SIInstrInfo *TII, AliasAnalysis *AA) {    assert(MemOp.mayLoadOrStore());    for (MachineInstr *InstToMove : InstsToMove) {      if (!InstToMove->mayLoadOrStore())        continue;      if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA)) -        return false; +      return false;    }    return true;  } @@ -260,10 +323,9 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {    CI.BaseOff = 0;    // Handle SMEM and VMEM instructions. -  if (CI.InstClass != DS_READ_WRITE) { -    unsigned Diff = CI.IsX2 ? 2 : 1; -    return (EltOffset0 + Diff == EltOffset1 || -            EltOffset1 + Diff == EltOffset0) && +  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { +    return (EltOffset0 + CI.Width0 == EltOffset1 || +            EltOffset1 + CI.Width1 == EltOffset0) &&             CI.GLC0 == CI.GLC1 &&             (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);    } @@ -305,42 +367,176 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {    return false;  } +bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, +                                     const CombineInfo &CI) { +  const unsigned Width = (CI.Width0 + CI.Width1); +  switch (CI.InstClass) { +  default: +    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3)); +  case S_BUFFER_LOAD_IMM: +    switch (Width) { +    default: +      return false; +    case 2: +    case 4: +      return true; +    } +  } +} + +unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) { +  const unsigned Opc = MI.getOpcode(); + +  if (TII->isMUBUF(MI)) { +    return AMDGPU::getMUBUFDwords(Opc); +  } + +  switch (Opc) { +  default: +    return 0; +  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: +    return 1; +  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: +    return 2; +  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: +    return 4; +  } +} + +InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) { +  if (TII->isMUBUF(Opc)) { +    const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc); + +    // If we couldn't identify the opcode, bail out. 
+    if (baseOpcode == -1) { +      return UNKNOWN; +    } + +    switch (baseOpcode) { +    default: +      return UNKNOWN; +    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: +      return BUFFER_LOAD_OFFEN; +    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: +      return BUFFER_LOAD_OFFSET; +    case AMDGPU::BUFFER_STORE_DWORD_OFFEN: +      return BUFFER_STORE_OFFEN; +    case AMDGPU::BUFFER_STORE_DWORD_OFFSET: +      return BUFFER_STORE_OFFSET; +    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: +      return BUFFER_LOAD_OFFEN_exact; +    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: +      return BUFFER_LOAD_OFFSET_exact; +    case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: +      return BUFFER_STORE_OFFEN_exact; +    case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: +      return BUFFER_STORE_OFFSET_exact; +    } +  } + +  switch (Opc) { +  default: +    return UNKNOWN; +  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: +  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: +  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: +    return S_BUFFER_LOAD_IMM; +  case AMDGPU::DS_READ_B32: +  case AMDGPU::DS_READ_B64: +  case AMDGPU::DS_READ_B32_gfx9: +  case AMDGPU::DS_READ_B64_gfx9: +    return DS_READ; +  case AMDGPU::DS_WRITE_B32: +  case AMDGPU::DS_WRITE_B64: +  case AMDGPU::DS_WRITE_B32_gfx9: +  case AMDGPU::DS_WRITE_B64_gfx9: +    return DS_WRITE; +  } +} + +unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) { +  if (TII->isMUBUF(Opc)) { +    unsigned result = 0; + +    if (AMDGPU::getMUBUFHasVAddr(Opc)) { +      result |= VADDR; +    } + +    if (AMDGPU::getMUBUFHasSrsrc(Opc)) { +      result |= SRSRC; +    } + +    if (AMDGPU::getMUBUFHasSoffset(Opc)) { +      result |= SOFFSET; +    } + +    return result; +  } + +  switch (Opc) { +  default: +    return 0; +  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: +  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: +  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: +    return SBASE; +  case AMDGPU::DS_READ_B32: +  case AMDGPU::DS_READ_B64: +  case AMDGPU::DS_READ_B32_gfx9: +  case AMDGPU::DS_READ_B64_gfx9: +  case AMDGPU::DS_WRITE_B32: +  case AMDGPU::DS_WRITE_B64: +  case AMDGPU::DS_WRITE_B32_gfx9: +  case AMDGPU::DS_WRITE_B64_gfx9: +    return ADDR; +  } +} +  bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {    MachineBasicBlock *MBB = CI.I->getParent();    MachineBasicBlock::iterator E = MBB->end();    MachineBasicBlock::iterator MBBI = CI.I; -  unsigned AddrOpName[3] = {0}; -  int AddrIdx[3]; -  const MachineOperand *AddrReg[3]; +  const unsigned Opc = CI.I->getOpcode(); +  const InstClassEnum InstClass = getInstClass(Opc); + +  if (InstClass == UNKNOWN) { +    return false; +  } + +  const unsigned Regs = getRegs(Opc); + +  unsigned AddrOpName[5] = {0}; +  int AddrIdx[5]; +  const MachineOperand *AddrReg[5];    unsigned NumAddresses = 0; -  switch (CI.InstClass) { -  case DS_READ_WRITE: +  if (Regs & ADDR) {      AddrOpName[NumAddresses++] = AMDGPU::OpName::addr; -    break; -  case S_BUFFER_LOAD_IMM: +  } + +  if (Regs & SBASE) {      AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase; -    break; -  case BUFFER_LOAD_OFFEN: -  case BUFFER_STORE_OFFEN: -    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; -    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; -    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; -    break; -  case BUFFER_LOAD_OFFSET: -  case BUFFER_STORE_OFFSET: +  } + +  if (Regs & SRSRC) {      AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; +  } + +  if (Regs & SOFFSET) {      AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; -    break; +  } + +  if (Regs & 
VADDR) { +    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;    }    for (unsigned i = 0; i < NumAddresses; i++) {      AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);      AddrReg[i] = &CI.I->getOperand(AddrIdx[i]); -    // We only ever merge operations with the same base address register, so don't -    // bother scanning forward if there are no other uses. +    // We only ever merge operations with the same base address register, so +    // don't bother scanning forward if there are no other uses.      if (AddrReg[i]->isReg() &&          (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||           MRI->hasOneNonDBGUse(AddrReg[i]->getReg()))) @@ -353,8 +549,11 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {    DenseSet<unsigned> PhysRegUsesToMove;    addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); -  for ( ; MBBI != E; ++MBBI) { -    if (MBBI->getOpcode() != CI.I->getOpcode()) { +  for (; MBBI != E; ++MBBI) { +    const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE); + +    if ((getInstClass(MBBI->getOpcode()) != InstClass) || +        (IsDS && (MBBI->getOpcode() != Opc))) {        // This is not a matching DS instruction, but we can keep looking as        // long as one of these conditions are met:        // 1. It is safe to move I down past MBBI. @@ -368,8 +567,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {        }        if (MBBI->mayLoadOrStore() && -        (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || -         !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) { +          (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || +           !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {          // We fail condition #1, but we may still be able to satisfy condition          // #2.  Add this instruction to the move list and then we will check          // if condition #2 holds once we have selected the matching instruction. @@ -413,8 +612,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {          continue;        } -      // Check same base pointer. Be careful of subregisters, which can occur with -      // vectors of pointers. +      // Check same base pointer. Be careful of subregisters, which can occur +      // with vectors of pointers.        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {          Match = false; @@ -423,13 +622,15 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {      }      if (Match) { -      int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), -                                                 AMDGPU::OpName::offset); +      int OffsetIdx = +          AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);        CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm(); +      CI.Width0 = getOpcodeWidth(*CI.I);        CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm(); +      CI.Width1 = getOpcodeWidth(*MBBI);        CI.Paired = MBBI; -      if (CI.InstClass == DS_READ_WRITE) { +      if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {          CI.Offset0 &= 0xffff;          CI.Offset1 &= 0xffff;        } else { @@ -445,7 +646,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {        // We also need to go through the list of instructions that we plan to        // move and make sure they are all safe to move down past the merged        // instruction. 
-      if (offsetsCanBeCombined(CI)) +      if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))          if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))            return true;      } @@ -472,12 +673,12 @@ unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {    if (STM->ldsRequiresM0Init())      return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; -  return (EltSize == 4) ? -    AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9; +  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 +                        : AMDGPU::DS_READ2ST64_B64_gfx9;  } -MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair( -  CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {    MachineBasicBlock *MBB = CI.I->getParent();    // Be careful, since the addresses could be subregisters themselves in weird @@ -489,8 +690,8 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(    unsigned NewOffset0 = CI.Offset0;    unsigned NewOffset1 = CI.Offset1; -  unsigned Opc = CI.UseST64 ? -    read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); +  unsigned Opc = +      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);    unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;    unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; @@ -502,39 +703,40 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(    }    assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && -         (NewOffset0 != NewOffset1) && -         "Computed offset doesn't fit"); +         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");    const MCInstrDesc &Read2Desc = TII->get(Opc); -  const TargetRegisterClass *SuperRC -    = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; +  const TargetRegisterClass *SuperRC = +      (CI.EltSize == 4) ? 
&AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;    unsigned DestReg = MRI->createVirtualRegister(SuperRC);    DebugLoc DL = CI.I->getDebugLoc();    unsigned BaseReg = AddrReg->getReg(); +  unsigned BaseSubReg = AddrReg->getSubReg();    unsigned BaseRegFlags = 0;    if (CI.BaseOff) {      unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);      BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) -      .addImm(CI.BaseOff); +        .addImm(CI.BaseOff);      BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);      BaseRegFlags = RegState::Kill;      TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) -      .addReg(ImmReg) -      .addReg(AddrReg->getReg()); +        .addReg(ImmReg) +        .addReg(AddrReg->getReg(), 0, BaseSubReg); +    BaseSubReg = 0;    }    MachineInstrBuilder Read2 = -    BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg) -      .addReg(BaseReg, BaseRegFlags) // addr -      .addImm(NewOffset0)            // offset0 -      .addImm(NewOffset1)            // offset1 -      .addImm(0)                     // gds -      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); +      BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg) +          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr +          .addImm(NewOffset0)                        // offset0 +          .addImm(NewOffset1)                        // offset1 +          .addImm(0)                                 // gds +          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});    (void)Read2; @@ -561,32 +763,36 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(  unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {    if (STM->ldsRequiresM0Init())      return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; -  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9; +  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 +                        : AMDGPU::DS_WRITE2_B64_gfx9;  }  unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {    if (STM->ldsRequiresM0Init()) -    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64; +    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 +                          : AMDGPU::DS_WRITE2ST64_B64; -  return (EltSize == 4) ? -    AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9; +  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 +                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;  } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( -  CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {    MachineBasicBlock *MBB = CI.I->getParent();    // Be sure to use .addOperand(), and not .addReg() with these. We want to be    // sure we preserve the subregister index and any register flags set on them. -  const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); -  const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); -  const MachineOperand *Data1 -    = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0); +  const MachineOperand *AddrReg = +      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); +  const MachineOperand *Data0 = +      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); +  const MachineOperand *Data1 = +      TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);    unsigned NewOffset0 = CI.Offset0;    unsigned NewOffset1 = CI.Offset1; -  unsigned Opc = CI.UseST64 ? 
-    write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); +  unsigned Opc = +      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);    if (NewOffset0 > NewOffset1) {      // Canonicalize the merged instruction so the smaller offset comes first. @@ -595,36 +801,37 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(    }    assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && -         (NewOffset0 != NewOffset1) && -         "Computed offset doesn't fit"); +         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");    const MCInstrDesc &Write2Desc = TII->get(Opc);    DebugLoc DL = CI.I->getDebugLoc();    unsigned BaseReg = AddrReg->getReg(); +  unsigned BaseSubReg = AddrReg->getSubReg();    unsigned BaseRegFlags = 0;    if (CI.BaseOff) {      unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);      BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) -      .addImm(CI.BaseOff); +        .addImm(CI.BaseOff);      BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);      BaseRegFlags = RegState::Kill;      TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) -      .addReg(ImmReg) -      .addReg(AddrReg->getReg()); +        .addReg(ImmReg) +        .addReg(AddrReg->getReg(), 0, BaseSubReg); +    BaseSubReg = 0;    }    MachineInstrBuilder Write2 = -    BuildMI(*MBB, CI.Paired, DL, Write2Desc) -      .addReg(BaseReg, BaseRegFlags) // addr -      .add(*Data0)                   // data0 -      .add(*Data1)                   // data1 -      .addImm(NewOffset0)            // offset0 -      .addImm(NewOffset1)            // offset1 -      .addImm(0)                     // gds -      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); +      BuildMI(*MBB, CI.Paired, DL, Write2Desc) +          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr +          .add(*Data0)                               // data0 +          .add(*Data1)                               // data1 +          .addImm(NewOffset0)                        // offset0 +          .addImm(NewOffset1)                        // offset1 +          .addImm(0)                                 // gds +          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});    moveInstsAfter(Write2, CI.InstsToMove); @@ -636,15 +843,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(    return Next;  } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( -  CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {    MachineBasicBlock *MBB = CI.I->getParent();    DebugLoc DL = CI.I->getDebugLoc(); -  unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM : -                              AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; +  const unsigned Opcode = getNewOpcode(CI); + +  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); -  const TargetRegisterClass *SuperRC = -    CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;    unsigned DestReg = MRI->createVirtualRegister(SuperRC);    unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1); @@ -652,14 +858,11 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(        .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))        .addImm(MergedOffset) // offset        .addImm(CI.GLC0)      // glc -      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); - -  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; -  unsigned SubRegIdx1 = CI.IsX2 ? 
AMDGPU::sub2_sub3 : AMDGPU::sub1; +      .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); -  // Handle descending offsets -  if (CI.Offset0 > CI.Offset1) -    std::swap(SubRegIdx0, SubRegIdx1); +  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI); +  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); +  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);    // Copy to the old destination registers.    const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); @@ -681,29 +884,25 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(    return Next;  } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( -  CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {    MachineBasicBlock *MBB = CI.I->getParent();    DebugLoc DL = CI.I->getDebugLoc(); -  unsigned Opcode; -  if (CI.InstClass == BUFFER_LOAD_OFFEN) { -    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN : -                       AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN; -  } else { -    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET : -                       AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; -  } +  const unsigned Opcode = getNewOpcode(CI); -  const TargetRegisterClass *SuperRC = -    CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass; +  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); + +  // Copy to the new source register.    unsigned DestReg = MRI->createVirtualRegister(SuperRC);    unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);    auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg); -  if (CI.InstClass == BUFFER_LOAD_OFFEN) -      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); +  const unsigned Regs = getRegs(Opcode); + +  if (Regs & VADDR) +    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))        .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -711,14 +910,11 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(        .addImm(CI.GLC0)      // glc        .addImm(CI.SLC0)      // slc        .addImm(0)            // tfe -      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); - -  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; -  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; +      .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); -  // Handle descending offsets -  if (CI.Offset0 > CI.Offset1) -    std::swap(SubRegIdx0, SubRegIdx1); +  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI); +  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); +  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);    // Copy to the old destination registers.    
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); @@ -740,57 +936,137 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(    return Next;  } -unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode( -  const MachineInstr &I, bool &IsX2, bool &IsOffen) const { -  IsX2 = false; -  IsOffen = false; - -  switch (I.getOpcode()) { -  case AMDGPU::BUFFER_STORE_DWORD_OFFEN: -    IsOffen = true; -    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN; -  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: -    IsOffen = true; -    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact; -  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: -    IsX2 = true; -    IsOffen = true; -    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN; -  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact: -    IsX2 = true; -    IsOffen = true; -    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact; -  case AMDGPU::BUFFER_STORE_DWORD_OFFSET: -    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; -  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: -    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact; -  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET: -    IsX2 = true; -    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; -  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact: -    IsX2 = true; -    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact; -  } -  return 0; -} - -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( -  CombineInfo &CI) { +unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) { +  const unsigned Width = CI.Width0 + CI.Width1; + +  switch (CI.InstClass) { +  default: +    return AMDGPU::getMUBUFOpcode(CI.InstClass, Width); +  case UNKNOWN: +    llvm_unreachable("Unknown instruction class"); +  case S_BUFFER_LOAD_IMM: +    switch (Width) { +    default: +      return 0; +    case 2: +      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; +    case 4: +      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; +    } +  } +} + +std::pair<unsigned, unsigned> +SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) { +  if (CI.Offset0 > CI.Offset1) { +    switch (CI.Width0) { +    default: +      return std::make_pair(0, 0); +    case 1: +      switch (CI.Width1) { +      default: +        return std::make_pair(0, 0); +      case 1: +        return std::make_pair(AMDGPU::sub1, AMDGPU::sub0); +      case 2: +        return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1); +      case 3: +        return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2); +      } +    case 2: +      switch (CI.Width1) { +      default: +        return std::make_pair(0, 0); +      case 1: +        return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0); +      case 2: +        return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1); +      } +    case 3: +      switch (CI.Width1) { +      default: +        return std::make_pair(0, 0); +      case 1: +        return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0); +      } +    } +  } else { +    switch (CI.Width0) { +    default: +      return std::make_pair(0, 0); +    case 1: +      switch (CI.Width1) { +      default: +        return std::make_pair(0, 0); +      case 1: +        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1); +      case 2: +        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2); +      case 3: +        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3); +      } +    case 2: +      switch (CI.Width1) { +      default: +        return std::make_pair(0, 0); +      case 1: +        return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2); +      case 
2: +        return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3); +      } +    case 3: +      switch (CI.Width1) { +      default: +        return std::make_pair(0, 0); +      case 1: +        return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3); +      } +    } +  } +} + +const TargetRegisterClass * +SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) { +  if (CI.InstClass == S_BUFFER_LOAD_IMM) { +    switch (CI.Width0 + CI.Width1) { +    default: +      return nullptr; +    case 2: +      return &AMDGPU::SReg_64_XEXECRegClass; +    case 4: +      return &AMDGPU::SReg_128RegClass; +    case 8: +      return &AMDGPU::SReg_256RegClass; +    case 16: +      return &AMDGPU::SReg_512RegClass; +    } +  } else { +    switch (CI.Width0 + CI.Width1) { +    default: +      return nullptr; +    case 2: +      return &AMDGPU::VReg_64RegClass; +    case 3: +      return &AMDGPU::VReg_96RegClass; +    case 4: +      return &AMDGPU::VReg_128RegClass; +    } +  } +} + +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {    MachineBasicBlock *MBB = CI.I->getParent();    DebugLoc DL = CI.I->getDebugLoc(); -  bool Unused1, Unused2; -  unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2); -  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; -  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; +  const unsigned Opcode = getNewOpcode(CI); -  // Handle descending offsets -  if (CI.Offset0 > CI.Offset1) -    std::swap(SubRegIdx0, SubRegIdx1); +  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI); +  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); +  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);    // Copy to the new source register. -  const TargetRegisterClass *SuperRC = -    CI.IsX2 ? 
&AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass; +  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);    unsigned SrcReg = MRI->createVirtualRegister(SuperRC);    const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); @@ -803,18 +1079,20 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(        .addImm(SubRegIdx1);    auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode)) -      .addReg(SrcReg, RegState::Kill); +                 .addReg(SrcReg, RegState::Kill); -  if (CI.InstClass == BUFFER_STORE_OFFEN) -      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); +  const unsigned Regs = getRegs(Opcode); + +  if (Regs & VADDR) +    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))        .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))        .addImm(std::min(CI.Offset0, CI.Offset1)) // offset -      .addImm(CI.GLC0)      // glc -      .addImm(CI.SLC0)      // slc -      .addImm(0)            // tfe -      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); +      .addImm(CI.GLC0)                          // glc +      .addImm(CI.SLC0)                          // slc +      .addImm(0)                                // tfe +      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});    moveInstsAfter(MIB, CI.InstsToMove); @@ -824,105 +1102,399 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(    return Next;  } +MachineOperand +SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) { +  APInt V(32, Val, true); +  if (TII->isInlineConstant(V)) +    return MachineOperand::CreateImm(Val); + +  unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); +  MachineInstr *Mov = +  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), +          TII->get(AMDGPU::S_MOV_B32), Reg) +    .addImm(Val); +  (void)Mov; +  LLVM_DEBUG(dbgs() << "    "; Mov->dump()); +  return MachineOperand::CreateReg(Reg, false); +} + +// Compute base address using Addr and return the final register. 
+unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, +                                           const MemAddress &Addr) { +  MachineBasicBlock *MBB = MI.getParent(); +  MachineBasicBlock::iterator MBBI = MI.getIterator(); +  DebugLoc DL = MI.getDebugLoc(); + +  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 || +          Addr.Base.LoSubReg) && +         "Expected 32-bit Base-Register-Low!!"); + +  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 || +          Addr.Base.HiSubReg) && +         "Expected 32-bit Base-Register-Hi!!"); + +  LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n"); +  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI); +  MachineOperand OffsetHi = +    createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI); +  unsigned CarryReg = MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); +  unsigned DeadCarryReg = +    MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + +  unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); +  unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); +  MachineInstr *LoHalf = +    BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0) +      .addReg(CarryReg, RegState::Define) +      .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg) +    .add(OffsetLo); +  (void)LoHalf; +  LLVM_DEBUG(dbgs() << "    "; LoHalf->dump();); + +  MachineInstr *HiHalf = +  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1) +    .addReg(DeadCarryReg, RegState::Define | RegState::Dead) +    .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg) +    .add(OffsetHi) +    .addReg(CarryReg, RegState::Kill); +  (void)HiHalf; +  LLVM_DEBUG(dbgs() << "    "; HiHalf->dump();); + +  unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass); +  MachineInstr *FullBase = +    BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg) +      .addReg(DestSub0) +      .addImm(AMDGPU::sub0) +      .addReg(DestSub1) +      .addImm(AMDGPU::sub1); +  (void)FullBase; +  LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";); + +  return FullDestReg; +} + +// Update base and offset with the NewBase and NewOffset in MI. 
+void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI, +                                               unsigned NewBase, +                                               int32_t NewOffset) { +  TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase); +  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset); +} + +Optional<int32_t> +SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) { +  if (Op.isImm()) +    return Op.getImm(); + +  if (!Op.isReg()) +    return None; + +  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg()); +  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 || +      !Def->getOperand(1).isImm()) +    return None; + +  return Def->getOperand(1).getImm(); +} + +// Analyze Base and extracts: +//  - 32bit base registers, subregisters +//  - 64bit constant offset +// Expecting base computation as: +//   %OFFSET0:sgpr_32 = S_MOV_B32 8000 +//   %LO:vgpr_32, %c:sreg_64_xexec = +//       V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32, +//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec +//   %Base:vreg_64 = +//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1 +void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base, +                                                      MemAddress &Addr) { +  if (!Base.isReg()) +    return; + +  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg()); +  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE +      || Def->getNumOperands() != 5) +    return; + +  MachineOperand BaseLo = Def->getOperand(1); +  MachineOperand BaseHi = Def->getOperand(3); +  if (!BaseLo.isReg() || !BaseHi.isReg()) +    return; + +  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg()); +  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg()); + +  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 || +      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64) +    return; + +  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0); +  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1); + +  auto Offset0P = extractConstOffset(*Src0); +  if (Offset0P) +    BaseLo = *Src1; +  else { +    if (!(Offset0P = extractConstOffset(*Src1))) +      return; +    BaseLo = *Src0; +  } + +  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0); +  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1); + +  if (Src0->isImm()) +    std::swap(Src0, Src1); + +  if (!Src1->isImm()) +    return; + +  uint64_t Offset1 = Src1->getImm(); +  BaseHi = *Src0; + +  Addr.Base.LoReg = BaseLo.getReg(); +  Addr.Base.HiReg = BaseHi.getReg(); +  Addr.Base.LoSubReg = BaseLo.getSubReg(); +  Addr.Base.HiSubReg = BaseHi.getSubReg(); +  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32); +} + +bool SILoadStoreOptimizer::promoteConstantOffsetToImm( +    MachineInstr &MI, +    MemInfoMap &Visited, +    SmallPtrSet<MachineInstr *, 4> &AnchorList) { + +  // TODO: Support flat and scratch. +  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 || +      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL) +    return false; + +  // TODO: Support Store. 
+  if (!MI.mayLoad()) +    return false; + +  if (AnchorList.count(&MI)) +    return false; + +  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump()); + +  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) { +    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";); +    return false; +  } + +  // Step1: Find the base-registers and a 64bit constant offset. +  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); +  MemAddress MAddr; +  if (Visited.find(&MI) == Visited.end()) { +    processBaseWithConstOffset(Base, MAddr); +    Visited[&MI] = MAddr; +  } else +    MAddr = Visited[&MI]; + +  if (MAddr.Offset == 0) { +    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no" +                         " constant offsets that can be promoted.\n";); +    return false; +  } + +  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", " +             << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";); + +  // Step2: Traverse through MI's basic block and find an anchor(that has the +  // same base-registers) with the highest 13bit distance from MI's offset. +  // E.g. (64bit loads) +  // bb: +  //   addr1 = &a + 4096;   load1 = load(addr1,  0) +  //   addr2 = &a + 6144;   load2 = load(addr2,  0) +  //   addr3 = &a + 8192;   load3 = load(addr3,  0) +  //   addr4 = &a + 10240;  load4 = load(addr4,  0) +  //   addr5 = &a + 12288;  load5 = load(addr5,  0) +  // +  // Starting from the first load, the optimization will try to find a new base +  // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192 +  // has 13bit distance from &a + 4096. The heuristic considers &a + 8192 +  // as the new-base(anchor) because of the maximum distance which can +  // accomodate more intermediate bases presumeably. +  // +  // Step3: move (&a + 8192) above load1. Compute and promote offsets from +  // (&a + 8192) for load1, load2, load4. +  //   addr = &a + 8192 +  //   load1 = load(addr,       -4096) +  //   load2 = load(addr,       -2048) +  //   load3 = load(addr,       0) +  //   load4 = load(addr,       2048) +  //   addr5 = &a + 12288;  load5 = load(addr5,  0) +  // +  MachineInstr *AnchorInst = nullptr; +  MemAddress AnchorAddr; +  uint32_t MaxDist = std::numeric_limits<uint32_t>::min(); +  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase; + +  MachineBasicBlock *MBB = MI.getParent(); +  MachineBasicBlock::iterator E = MBB->end(); +  MachineBasicBlock::iterator MBBI = MI.getIterator(); +  ++MBBI; +  const SITargetLowering *TLI = +    static_cast<const SITargetLowering *>(STM->getTargetLowering()); + +  for ( ; MBBI != E; ++MBBI) { +    MachineInstr &MINext = *MBBI; +    // TODO: Support finding an anchor(with same base) from store addresses or +    // any other load addresses where the opcodes are different. 
+    if (MINext.getOpcode() != MI.getOpcode() || +        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm()) +      continue; + +    const MachineOperand &BaseNext = +      *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr); +    MemAddress MAddrNext; +    if (Visited.find(&MINext) == Visited.end()) { +      processBaseWithConstOffset(BaseNext, MAddrNext); +      Visited[&MINext] = MAddrNext; +    } else +      MAddrNext = Visited[&MINext]; + +    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg || +        MAddrNext.Base.HiReg != MAddr.Base.HiReg || +        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg || +        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg) +      continue; + +    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset)); + +    int64_t Dist = MAddr.Offset - MAddrNext.Offset; +    TargetLoweringBase::AddrMode AM; +    AM.HasBaseReg = true; +    AM.BaseOffs = Dist; +    if (TLI->isLegalGlobalAddressingMode(AM) && +        (uint32_t)std::abs(Dist) > MaxDist) { +      MaxDist = std::abs(Dist); + +      AnchorAddr = MAddrNext; +      AnchorInst = &MINext; +    } +  } + +  if (AnchorInst) { +    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): "; +               AnchorInst->dump()); +    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: " +               <<  AnchorAddr.Offset << "\n\n"); + +    // Instead of moving up, just re-compute anchor-instruction's base address. +    unsigned Base = computeBase(MI, AnchorAddr); + +    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset); +    LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump();); + +    for (auto P : InstsWCommonBase) { +      TargetLoweringBase::AddrMode AM; +      AM.HasBaseReg = true; +      AM.BaseOffs = P.second - AnchorAddr.Offset; + +      if (TLI->isLegalGlobalAddressingMode(AM)) { +        LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second; +                   dbgs() << ")"; P.first->dump()); +        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset); +        LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump()); +      } +    } +    AnchorList.insert(AnchorInst); +    return true; +  } + +  return false; +} +  // Scan through looking for adjacent LDS operations with constant offsets from  // the same base register. We rely on the scheduler to do the hard work of  // clustering nearby loads, and assume these are all adjacent.  bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {    bool Modified = false; +  // Contain the list +  MemInfoMap Visited; +  // Contains the list of instructions for which constant offsets are being +  // promoted to the IMM. +  SmallPtrSet<MachineInstr *, 4> AnchorList; +    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {      MachineInstr &MI = *I; +    if (promoteConstantOffsetToImm(MI, Visited, AnchorList)) +      Modified = true; +      // Don't combine if volatile.      if (MI.hasOrderedMemoryRef()) {        ++I;        continue;      } +    const unsigned Opc = MI.getOpcode(); +      CombineInfo CI;      CI.I = I; -    unsigned Opc = MI.getOpcode(); -    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 || -        Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) { +    CI.InstClass = getInstClass(Opc); -      CI.InstClass = DS_READ_WRITE; +    switch (CI.InstClass) { +    default: +      break; +    case DS_READ:        CI.EltSize = -        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 
8 : 4; - +          (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 +                                                                          : 4;        if (findMatchingInst(CI)) {          Modified = true;          I = mergeRead2Pair(CI);        } else {          ++I;        } -        continue; -    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 || -               Opc == AMDGPU::DS_WRITE_B32_gfx9 || -               Opc == AMDGPU::DS_WRITE_B64_gfx9) { -      CI.InstClass = DS_READ_WRITE; -      CI.EltSize -        = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4; - +    case DS_WRITE: +      CI.EltSize = +          (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 +                                                                            : 4;        if (findMatchingInst(CI)) {          Modified = true;          I = mergeWrite2Pair(CI);        } else {          ++I;        } -        continue; -    } -    if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM || -        Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) { -      // EltSize is in units of the offset encoding. -      CI.InstClass = S_BUFFER_LOAD_IMM; +    case S_BUFFER_LOAD_IMM:        CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4); -      CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;        if (findMatchingInst(CI)) {          Modified = true;          I = mergeSBufferLoadImmPair(CI); -        if (!CI.IsX2) -          CreatedX2++; +        OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;        } else {          ++I;        }        continue; -    } -    if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN || -        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN || -        Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET || -        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) { -      if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN || -          Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN) -        CI.InstClass = BUFFER_LOAD_OFFEN; -      else -        CI.InstClass = BUFFER_LOAD_OFFSET; - +    case BUFFER_LOAD_OFFEN: +    case BUFFER_LOAD_OFFSET: +    case BUFFER_LOAD_OFFEN_exact: +    case BUFFER_LOAD_OFFSET_exact:        CI.EltSize = 4; -      CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN || -                Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;        if (findMatchingInst(CI)) {          Modified = true;          I = mergeBufferLoadPair(CI); -        if (!CI.IsX2) -          CreatedX2++; +        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;        } else {          ++I;        }        continue; -    } - -    bool StoreIsX2, IsOffen; -    if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) { -      CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET; +    case BUFFER_STORE_OFFEN: +    case BUFFER_STORE_OFFSET: +    case BUFFER_STORE_OFFEN_exact: +    case BUFFER_STORE_OFFSET_exact:        CI.EltSize = 4; -      CI.IsX2 = StoreIsX2;        if (findMatchingInst(CI)) {          Modified = true;          I = mergeBufferStorePair(CI); -        if (!CI.IsX2) -          CreatedX2++; +        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;        } else {          ++I;        } @@ -956,12 +1528,10 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {    bool Modified = false;    for (MachineBasicBlock &MBB : MF) { -    CreatedX2 = 0; -    Modified |= optimizeBlock(MBB); - -    // Run again to convert x2 to x4. 
-    if (CreatedX2 >= 1)
+    do {
+      OptimizeAgain = false;
       Modified |= optimizeBlock(MBB);
+    } while (OptimizeAgain);
   }

   return Modified;
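
A few notes on the techniques introduced by this patch, each illustrated with a small self-contained C++ sketch (none of this code is from the patch itself).

The patch drops the single IsX2 flag and the CreatedX2 counter in favor of per-instruction dword widths (Width0/Width1): two accesses merge when their ranges are exactly adjacent (EltOffset0 + Width0 == EltOffset1, or the symmetric case) and the combined width is encodable (a 3-dword result only on subtargets with dwordx3 load/stores, per widthsFit), and runOnMachineFunction now re-runs optimizeBlock while OptimizeAgain is set so freshly created x2 results can merge again into x4. Below is a minimal model of that iteration, with opcodes reduced to plain widths; only the adjacency and width conditions come from offsetsCanBeCombined/widthsFit, everything else (the Access struct, the vector "block") is illustrative:

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// One buffer access, reduced to its offset and width in dwords
// (BUFFER_LOAD_DWORD == 1, _DWORDX2 == 2, ...).
struct Access {
  unsigned Offset; // in dwords
  unsigned Width;  // in dwords
};

// Mirrors the new offsetsCanBeCombined/widthsFit checks for VMEM: the two
// ranges must be exactly adjacent and the merged width must be encodable
// (x3 only when the subtarget supports dwordx3 load/stores).
static bool canMerge(const Access &A, const Access &B, bool HasDwordx3) {
  unsigned W = A.Width + B.Width;
  bool Adjacent = (A.Offset + A.Width == B.Offset) ||
                  (B.Offset + B.Width == A.Offset);
  return Adjacent && W <= 4 && (HasDwordx3 || W != 3);
}

int main() {
  // Four adjacent dword loads. Even without dwordx3 they still end up as a
  // single x4, because the block is re-optimized while OptimizeAgain is set.
  std::vector<Access> Block = {{0, 1}, {1, 1}, {2, 1}, {3, 1}};
  const bool HasDwordx3 = false;

  bool OptimizeAgain;
  do {
    OptimizeAgain = false;
    for (std::size_t I = 0; I + 1 < Block.size(); ++I) {
      if (canMerge(Block[I], Block[I + 1], HasDwordx3)) {
        Block[I] = {std::min(Block[I].Offset, Block[I + 1].Offset),
                    Block[I].Width + Block[I + 1].Width};
        Block.erase(Block.begin() + I + 1);
        // A merged result narrower than x4 may merge again on the next pass.
        OptimizeAgain |= Block[I].Width < 4;
      }
    }
  } while (OptimizeAgain);

  for (const Access &A : Block)
    std::cout << "dwordx" << A.Width << " @ dword offset " << A.Offset << "\n";
}
```

This is also why the `OptimizeAgain |= (CI.Width0 + CI.Width1) < 4` updates in optimizeBlock (and `< 16` for S_BUFFER_LOAD_IMM) replace the old "run again to convert x2 to x4" special case.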
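The large nested switch in getSubRegIdxs only encodes which dword lanes of the merged register each original value is copied out of: whichever access has the smaller offset gets the low lanes. A tiny sketch of that rule, with plain lane indices standing in for the AMDGPU::sub0/sub0_sub1/... names (mergedLaneStarts is a hypothetical helper, not part of the pass):

```cpp
#include <iostream>
#include <utility>

// Returns the starting dword lane of value 0 and value 1 inside the merged
// register. The lower-offset access always occupies the low lanes, which is
// what the (Width0, Width1) switch in getSubRegIdxs spells out as
// subregister-index pairs.
static std::pair<unsigned, unsigned>
mergedLaneStarts(unsigned Off0, unsigned W0, unsigned Off1, unsigned W1) {
  if (Off0 > Off1)        // descending offsets: value 1 is the low part
    return {W1, 0u};
  return {0u, W0};        // ascending offsets: value 0 is the low part
}

int main() {
  // A dword access paired with a dwordx2 access at a lower offset: the x2
  // value lands in sub0_sub1, the single dword in sub2.
  auto [S0, S1] = mergedLaneStarts(/*Off0=*/2, /*W0=*/1, /*Off1=*/0, /*W1=*/2);
  std::cout << "value0 starts at dword " << S0
            << ", value1 starts at dword " << S1 << "\n";
}
```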
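promoteConstantOffsetToImm picks, among the later loads in the block that share the same 64-bit base registers, the one whose constant offset is farthest from the current load's offset while the difference still fits the instruction's immediate field; that anchor address is materialized once and every other qualifying offset is rewritten as an immediate relative to it. Here is a self-contained sketch of the heuristic on the example from the pass comment; the Load struct and the hard-coded signed 13-bit range in fitsImm are assumptions of this model (the pass itself asks SITargetLowering::isLegalGlobalAddressingMode):

```cpp
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <vector>

// A global load whose address is base + Offset, with nothing folded into the
// instruction's immediate offset field yet (Imm == 0).
struct Load {
  int Id;
  int64_t Offset; // constant offset baked into the address computation
  int64_t Imm;    // immediate offset field of the instruction
};

// Stand-in for the signed 13-bit immediate range of gfx9 global loads
// (an assumption of this sketch; the pass uses isLegalGlobalAddressingMode).
static bool fitsImm(int64_t D) { return D >= -4096 && D <= 4095; }

// For load LI, choose as anchor the later load with the same base whose
// offset is at maximum distance from LI's offset while that distance still
// fits the immediate, then rewrite every load whose distance from the anchor
// also fits.
static void promote(std::vector<Load> &Loads, std::size_t LI) {
  int64_t Best = 0;
  std::size_t Anchor = LI;
  for (std::size_t J = LI + 1; J < Loads.size(); ++J) {
    int64_t Dist = Loads[LI].Offset - Loads[J].Offset;
    if (fitsImm(Dist) && std::abs(Dist) > std::abs(Best)) {
      Best = Dist;
      Anchor = J;
    }
  }
  if (Anchor == LI)
    return; // no usable anchor found
  for (std::size_t J = LI; J < Loads.size(); ++J) {
    int64_t Dist = Loads[J].Offset - Loads[Anchor].Offset;
    if (fitsImm(Dist)) {
      Loads[J].Offset = Loads[Anchor].Offset; // share the anchor's address
      Loads[J].Imm = Dist;                    // difference becomes the immediate
    }
  }
}

int main() {
  // The example from the pass comment: &a + 4096, 6144, 8192, 10240, 12288.
  std::vector<Load> Loads = {
      {1, 4096, 0}, {2, 6144, 0}, {3, 8192, 0}, {4, 10240, 0}, {5, 12288, 0}};
  promote(Loads, 0); // the anchor ends up being &a + 8192
  for (const Load &L : Loads)
    std::cout << "load" << L.Id << ": base+" << L.Offset
              << " imm " << L.Imm << "\n";
}
```

Choosing the farthest legal anchor leaves the most room for offsets on both sides of it to still fit, which is why &a + 8192 is preferred over &a + 6144 in the example (load5 at &a + 12288 is left alone because its distance from the anchor does not fit).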
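computeBase materializes "anchor base + offset" as a 32-bit low add that defines a carry (V_ADD_I32_e64), a carry-consuming high add (V_ADDC_U32_e64), and a REG_SEQUENCE that glues the two halves back into a 64-bit register, with each half of the offset emitted via createRegOrImm as either an inline constant or an S_MOV_B32. The host-side arithmetic this corresponds to is sketched below, as a model rather than the MIR the pass builds (the helper name and the example address are made up for illustration):

```cpp
#include <cstdint>
#include <iostream>

// 64-bit base + offset decomposed into the 32-bit lo/hi adds that
// computeBase emits: the lo add produces a carry, the hi add consumes it.
static uint64_t addBaseOffset(uint32_t BaseLo, uint32_t BaseHi, int64_t Offset) {
  uint32_t OffLo = static_cast<uint32_t>(Offset);       // OffsetLo operand
  uint32_t OffHi = static_cast<uint32_t>(Offset >> 32); // OffsetHi operand
  uint64_t Lo = static_cast<uint64_t>(BaseLo) + OffLo;  // V_ADD_I32_e64
  uint32_t Carry = static_cast<uint32_t>(Lo >> 32);     // carry-out of the lo add
  uint32_t Hi = BaseHi + OffHi + Carry;                 // V_ADDC_U32_e64
  return (static_cast<uint64_t>(Hi) << 32) |            // REG_SEQUENCE sub1:sub0
         static_cast<uint32_t>(Lo);
}

int main() {
  // An arbitrary base of 0x1ffffe000 plus the 8192-byte anchor offset:
  // the carry from the lo add propagates into the hi half.
  std::cout << std::hex << addBaseOffset(0xffffe000u, 0x1u, 0x2000) << "\n";
  // prints 200000000
}
```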
