Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
 -rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 532
1 file changed, 463 insertions, 69 deletions
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index c6ad61a325cc..84cd47a101a8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -1,4 +1,4 @@ -//===-- SILoadStoreOptimizer.cpp ------------------------------------------===// +//===- SILoadStoreOptimizer.cpp -------------------------------------------===//  //  //                     The LLVM Compiler Infrastructure  // @@ -14,6 +14,12 @@  // ==>  //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8  // +// The same is done for certain SMEM and VMEM opcodes, e.g.: +//  s_buffer_load_dword s4, s[0:3], 4 +//  s_buffer_load_dword s5, s[0:3], 8 +// ==> +//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4 +//  //  // Future improvements:  // @@ -56,8 +62,9 @@  #include "llvm/Support/Debug.h"  #include "llvm/Support/MathExtras.h"  #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" +#include <algorithm>  #include <cassert> +#include <cstdlib>  #include <iterator>  #include <utility> @@ -68,31 +75,56 @@ using namespace llvm;  namespace {  class SILoadStoreOptimizer : public MachineFunctionPass { - -  typedef struct { +  enum InstClassEnum { +    DS_READ_WRITE, +    S_BUFFER_LOAD_IMM, +    BUFFER_LOAD_OFFEN, +    BUFFER_LOAD_OFFSET, +    BUFFER_STORE_OFFEN, +    BUFFER_STORE_OFFSET, +  }; + +  struct CombineInfo {      MachineBasicBlock::iterator I;      MachineBasicBlock::iterator Paired;      unsigned EltSize;      unsigned Offset0;      unsigned Offset1;      unsigned BaseOff; +    InstClassEnum InstClass; +    bool GLC0; +    bool GLC1; +    bool SLC0; +    bool SLC1;      bool UseST64; +    bool IsX2;      SmallVector<MachineInstr*, 8> InstsToMove; -   } CombineInfo; +   };  private: +  const SISubtarget *STM = nullptr;    const SIInstrInfo *TII = nullptr;    const SIRegisterInfo *TRI = nullptr;    MachineRegisterInfo *MRI = nullptr;    AliasAnalysis *AA = nullptr; +  unsigned CreatedX2;    static bool offsetsCanBeCombined(CombineInfo &CI); -  bool findMatchingDSInst(CombineInfo &CI); +  bool findMatchingInst(CombineInfo &CI); +  unsigned read2Opcode(unsigned EltSize) const; +  unsigned read2ST64Opcode(unsigned EltSize) const;    MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI); +  unsigned write2Opcode(unsigned EltSize) const; +  unsigned write2ST64Opcode(unsigned EltSize) const;    MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI); +  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI); +  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI); +  unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2, +                                    bool &IsOffen) const; +  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);  public:    static char ID; @@ -141,36 +173,35 @@ static void moveInstsAfter(MachineBasicBlock::iterator I,    }  } -static void addDefsToList(const MachineInstr &MI, -                          SmallVectorImpl<const MachineOperand *> &Defs) { -  for (const MachineOperand &Def : MI.defs()) { -    Defs.push_back(&Def); -  } +static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) { +  // XXX: Should this be looking for implicit defs? 
+  for (const MachineOperand &Def : MI.defs()) +    Defs.insert(Def.getReg());  }  static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,                                        MachineBasicBlock::iterator B,                                        const SIInstrInfo *TII,                                        AliasAnalysis * AA) { -  return (TII->areMemAccessesTriviallyDisjoint(*A, *B, AA) || -    // RAW or WAR - cannot reorder -    // WAW - cannot reorder -    // RAR - safe to reorder -    !(A->mayStore() || B->mayStore())); +  // RAW or WAR - cannot reorder +  // WAW - cannot reorder +  // RAR - safe to reorder +  return !(A->mayStore() || B->mayStore()) || +    TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);  }  // Add MI and its defs to the lists if MI reads one of the defs that are  // already in the list. Returns true in that case.  static bool  addToListsIfDependent(MachineInstr &MI, -                      SmallVectorImpl<const MachineOperand *> &Defs, +                      DenseSet<unsigned> &Defs,                        SmallVectorImpl<MachineInstr*> &Insts) { -  for (const MachineOperand *Def : Defs) { -    bool ReadDef = MI.readsVirtualRegister(Def->getReg()); -    // If ReadDef is true, then there is a use of Def between I -    // and the instruction that I will potentially be merged with. We -    // will need to move this instruction after the merged instructions. -    if (ReadDef) { +  for (MachineOperand &Use : MI.operands()) { +    // If one of the defs is read, then there is a use of Def between I and the +    // instruction that I will potentially be merged with. We will need to move +    // this instruction after the merged instructions. + +    if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) {        Insts.push_back(&MI);        addDefsToList(MI, Defs);        return true; @@ -211,6 +242,15 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {    CI.UseST64 = false;    CI.BaseOff = 0; +  // Handle SMEM and VMEM instructions. +  if (CI.InstClass != DS_READ_WRITE) { +    unsigned Diff = CI.IsX2 ? 2 : 1; +    return (EltOffset0 + Diff == EltOffset1 || +            EltOffset1 + Diff == EltOffset0) && +           CI.GLC0 == CI.GLC1 && +           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1); +  } +    // If the offset in elements doesn't fit in 8-bits, we might be able to use    // the stride 64 versions.    
if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && @@ -248,30 +288,70 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {    return false;  } -bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) { -  MachineBasicBlock::iterator E = CI.I->getParent()->end(); +bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { +  MachineBasicBlock *MBB = CI.I->getParent(); +  MachineBasicBlock::iterator E = MBB->end();    MachineBasicBlock::iterator MBBI = CI.I; + +  unsigned AddrOpName[3] = {0}; +  int AddrIdx[3]; +  const MachineOperand *AddrReg[3]; +  unsigned NumAddresses = 0; + +  switch (CI.InstClass) { +  case DS_READ_WRITE: +    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr; +    break; +  case S_BUFFER_LOAD_IMM: +    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase; +    break; +  case BUFFER_LOAD_OFFEN: +  case BUFFER_STORE_OFFEN: +    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; +    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; +    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; +    break; +  case BUFFER_LOAD_OFFSET: +  case BUFFER_STORE_OFFSET: +    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; +    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; +    break; +  } + +  for (unsigned i = 0; i < NumAddresses; i++) { +    AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]); +    AddrReg[i] = &CI.I->getOperand(AddrIdx[i]); + +    // We only ever merge operations with the same base address register, so don't +    // bother scanning forward if there are no other uses. +    if (AddrReg[i]->isReg() && +        (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) || +         MRI->hasOneNonDBGUse(AddrReg[i]->getReg()))) +      return false; +  } +    ++MBBI; -  SmallVector<const MachineOperand *, 8> DefsToMove; +  DenseSet<unsigned> DefsToMove;    addDefsToList(*CI.I, DefsToMove);    for ( ; MBBI != E; ++MBBI) {      if (MBBI->getOpcode() != CI.I->getOpcode()) { -        // This is not a matching DS instruction, but we can keep looking as        // long as one of these conditions are met:        // 1. It is safe to move I down past MBBI.        // 2. It is safe to move MBBI down past the instruction that I will        //    be merged into. -      if (MBBI->hasUnmodeledSideEffects()) +      if (MBBI->hasUnmodeledSideEffects()) {          // We can't re-order this instruction with respect to other memory -        // opeations, so we fail both conditions mentioned above. +        // operations, so we fail both conditions mentioned above.          return false; +      }        if (MBBI->mayLoadOrStore() && -        !memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA)) { +        (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || +         !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {          // We fail condition #1, but we may still be able to satisfy condition          // #2.  Add this instruction to the move list and then we will check          // if condition #2 holds once we have selected the matching instruction. 
@@ -300,21 +380,47 @@ bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) {      if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove))        continue; -    int AddrIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), -                                             AMDGPU::OpName::addr); -    const MachineOperand &AddrReg0 = CI.I->getOperand(AddrIdx); -    const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx); +    bool Match = true; +    for (unsigned i = 0; i < NumAddresses; i++) { +      const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]); + +      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) { +        if (AddrReg[i]->isImm() != AddrRegNext.isImm() || +            AddrReg[i]->getImm() != AddrRegNext.getImm()) { +          Match = false; +          break; +        } +        continue; +      } + +      // Check same base pointer. Be careful of subregisters, which can occur with +      // vectors of pointers. +      if (AddrReg[i]->getReg() != AddrRegNext.getReg() || +          AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) { +        Match = false; +        break; +      } +    } -    // Check same base pointer. Be careful of subregisters, which can occur with -    // vectors of pointers. -    if (AddrReg0.getReg() == AddrReg1.getReg() && -        AddrReg0.getSubReg() == AddrReg1.getSubReg()) { +    if (Match) {        int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),                                                   AMDGPU::OpName::offset); -      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm() & 0xffff; -      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff; +      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm(); +      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();        CI.Paired = MBBI; +      if (CI.InstClass == DS_READ_WRITE) { +        CI.Offset0 &= 0xffff; +        CI.Offset1 &= 0xffff; +      } else { +        CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm(); +        CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm(); +        if (CI.InstClass != S_BUFFER_LOAD_IMM) { +          CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm(); +          CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm(); +        } +      } +        // Check both offsets fit in the reduced range.        // We also need to go through the list of instructions that we plan to        // move and make sure they are all safe to move down past the merged @@ -336,6 +442,20 @@ bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) {    return false;  } +unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { +  if (STM->ldsRequiresM0Init()) +    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; +  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; +} + +unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { +  if (STM->ldsRequiresM0Init()) +    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; + +  return (EltSize == 4) ? +    AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9; +} +  MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(    CombineInfo &CI) {    MachineBasicBlock *MBB = CI.I->getParent(); @@ -349,12 +469,8 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(    unsigned NewOffset0 = CI.Offset0;    unsigned NewOffset1 = CI.Offset1; -  unsigned Opc = (CI.EltSize == 4) ? 
AMDGPU::DS_READ2_B32 -                                   : AMDGPU::DS_READ2_B64; - -  if (CI.UseST64) -    Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 -                            : AMDGPU::DS_READ2ST64_B64; +  unsigned Opc = CI.UseST64 ? +    read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);    unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;    unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; @@ -382,9 +498,12 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(    if (CI.BaseOff) {      BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);      BaseRegFlags = RegState::Kill; -    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg) -           .addImm(CI.BaseOff) -           .addReg(AddrReg->getReg()); + +    unsigned AddOpc = STM->hasAddNoCarry() ? +      AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32; +    BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg) +      .addImm(CI.BaseOff) +      .addReg(AddrReg->getReg());    }    MachineInstrBuilder Read2 = @@ -417,6 +536,20 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(    return Next;  } +unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { +  if (STM->ldsRequiresM0Init()) +    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; +  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9; +} + +unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { +  if (STM->ldsRequiresM0Init()) +    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64; + +  return (EltSize == 4) ? +    AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9; +} +  MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(    CombineInfo &CI) {    MachineBasicBlock *MBB = CI.I->getParent(); @@ -430,12 +563,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(    unsigned NewOffset0 = CI.Offset0;    unsigned NewOffset1 = CI.Offset1; -  unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2_B32 -                                   : AMDGPU::DS_WRITE2_B64; - -  if (CI.UseST64) -    Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 -                            : AMDGPU::DS_WRITE2ST64_B64; +  unsigned Opc = CI.UseST64 ? +    write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);    if (NewOffset0 > NewOffset1) {      // Canonicalize the merged instruction so the smaller offset comes first. @@ -455,9 +584,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(    if (CI.BaseOff) {      BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);      BaseRegFlags = RegState::Kill; -    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg) -           .addImm(CI.BaseOff) -           .addReg(Addr->getReg()); + +    unsigned AddOpc = STM->hasAddNoCarry() ? +      AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32; +    BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg) +      .addImm(CI.BaseOff) +      .addReg(Addr->getReg());    }    MachineInstrBuilder Write2 = @@ -480,6 +612,194 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(    return Next;  } +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( +  CombineInfo &CI) { +  MachineBasicBlock *MBB = CI.I->getParent(); +  DebugLoc DL = CI.I->getDebugLoc(); +  unsigned Opcode = CI.IsX2 ? 
AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM : +                              AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; + +  const TargetRegisterClass *SuperRC = +    CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass; +  unsigned DestReg = MRI->createVirtualRegister(SuperRC); +  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1); + +  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg) +      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) +      .addImm(MergedOffset) // offset +      .addImm(CI.GLC0)      // glc +      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); + +  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; +  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; + +  // Handle descending offsets +  if (CI.Offset0 > CI.Offset1) +    std::swap(SubRegIdx0, SubRegIdx1); + +  // Copy to the old destination registers. +  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); +  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); +  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst); + +  BuildMI(*MBB, CI.Paired, DL, CopyDesc) +      .add(*Dest0) // Copy to same destination including flags and sub reg. +      .addReg(DestReg, 0, SubRegIdx0); +  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc) +                            .add(*Dest1) +                            .addReg(DestReg, RegState::Kill, SubRegIdx1); + +  moveInstsAfter(Copy1, CI.InstsToMove); + +  MachineBasicBlock::iterator Next = std::next(CI.I); +  CI.I->eraseFromParent(); +  CI.Paired->eraseFromParent(); +  return Next; +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( +  CombineInfo &CI) { +  MachineBasicBlock *MBB = CI.I->getParent(); +  DebugLoc DL = CI.I->getDebugLoc(); +  unsigned Opcode; + +  if (CI.InstClass == BUFFER_LOAD_OFFEN) { +    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN : +                       AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN; +  } else { +    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET : +                       AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; +  } + +  const TargetRegisterClass *SuperRC = +    CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass; +  unsigned DestReg = MRI->createVirtualRegister(SuperRC); +  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1); + +  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg); + +  if (CI.InstClass == BUFFER_LOAD_OFFEN) +      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); + +  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) +      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) +      .addImm(MergedOffset) // offset +      .addImm(CI.GLC0)      // glc +      .addImm(CI.SLC0)      // slc +      .addImm(0)            // tfe +      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); + +  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; +  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; + +  // Handle descending offsets +  if (CI.Offset0 > CI.Offset1) +    std::swap(SubRegIdx0, SubRegIdx1); + +  // Copy to the old destination registers. +  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); +  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); +  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata); + +  BuildMI(*MBB, CI.Paired, DL, CopyDesc) +      .add(*Dest0) // Copy to same destination including flags and sub reg. 
+      .addReg(DestReg, 0, SubRegIdx0); +  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc) +                            .add(*Dest1) +                            .addReg(DestReg, RegState::Kill, SubRegIdx1); + +  moveInstsAfter(Copy1, CI.InstsToMove); + +  MachineBasicBlock::iterator Next = std::next(CI.I); +  CI.I->eraseFromParent(); +  CI.Paired->eraseFromParent(); +  return Next; +} + +unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode( +  const MachineInstr &I, bool &IsX2, bool &IsOffen) const { +  IsX2 = false; +  IsOffen = false; + +  switch (I.getOpcode()) { +  case AMDGPU::BUFFER_STORE_DWORD_OFFEN: +    IsOffen = true; +    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN; +  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: +    IsOffen = true; +    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact; +  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: +    IsX2 = true; +    IsOffen = true; +    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN; +  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact: +    IsX2 = true; +    IsOffen = true; +    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact; +  case AMDGPU::BUFFER_STORE_DWORD_OFFSET: +    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; +  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: +    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact; +  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET: +    IsX2 = true; +    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; +  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact: +    IsX2 = true; +    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact; +  } +  return 0; +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( +  CombineInfo &CI) { +  MachineBasicBlock *MBB = CI.I->getParent(); +  DebugLoc DL = CI.I->getDebugLoc(); +  bool Unused1, Unused2; +  unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2); + +  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; +  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; + +  // Handle descending offsets +  if (CI.Offset0 > CI.Offset1) +    std::swap(SubRegIdx0, SubRegIdx1); + +  // Copy to the new source register. +  const TargetRegisterClass *SuperRC = +    CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass; +  unsigned SrcReg = MRI->createVirtualRegister(SuperRC); + +  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); +  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata); + +  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) +      .add(*Src0) +      .addImm(SubRegIdx0) +      .add(*Src1) +      .addImm(SubRegIdx1); + +  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode)) +      .addReg(SrcReg, RegState::Kill); + +  if (CI.InstClass == BUFFER_STORE_OFFEN) +      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); + +  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) +      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) +      .addImm(std::min(CI.Offset0, CI.Offset1)) // offset +      .addImm(CI.GLC0)      // glc +      .addImm(CI.SLC0)      // slc +      .addImm(0)            // tfe +      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); + +  moveInstsAfter(MIB, CI.InstsToMove); + +  MachineBasicBlock::iterator Next = std::next(CI.I); +  CI.I->eraseFromParent(); +  CI.Paired->eraseFromParent(); +  return Next; +} +  // Scan through looking for adjacent LDS operations with constant offsets from  // the same base register. 
We rely on the scheduler to do the hard work of  // clustering nearby loads, and assume these are all adjacent. @@ -498,9 +818,14 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {      CombineInfo CI;      CI.I = I;      unsigned Opc = MI.getOpcode(); -    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) { -      CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4; -      if (findMatchingDSInst(CI)) { +    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 || +        Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) { + +      CI.InstClass = DS_READ_WRITE; +      CI.EltSize = +        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4; + +      if (findMatchingInst(CI)) {          Modified = true;          I = mergeRead2Pair(CI);        } else { @@ -508,9 +833,14 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {        }        continue; -    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) { -      CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4; -      if (findMatchingDSInst(CI)) { +    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 || +               Opc == AMDGPU::DS_WRITE_B32_gfx9 || +               Opc == AMDGPU::DS_WRITE_B64_gfx9) { +      CI.InstClass = DS_READ_WRITE; +      CI.EltSize +        = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4; + +      if (findMatchingInst(CI)) {          Modified = true;          I = mergeWrite2Pair(CI);        } else { @@ -519,6 +849,62 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {        continue;      } +    if (STM->hasSBufferLoadStoreAtomicDwordxN() && +        (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM || +         Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM)) { +      // EltSize is in units of the offset encoding. +      CI.InstClass = S_BUFFER_LOAD_IMM; +      CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4); +      CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; +      if (findMatchingInst(CI)) { +        Modified = true; +        I = mergeSBufferLoadImmPair(CI); +        if (!CI.IsX2) +          CreatedX2++; +      } else { +        ++I; +      } +      continue; +    } +    if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN || +        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN || +        Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET || +        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) { +      if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN || +          Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN) +        CI.InstClass = BUFFER_LOAD_OFFEN; +      else +        CI.InstClass = BUFFER_LOAD_OFFSET; + +      CI.EltSize = 4; +      CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN || +                Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; +      if (findMatchingInst(CI)) { +        Modified = true; +        I = mergeBufferLoadPair(CI); +        if (!CI.IsX2) +          CreatedX2++; +      } else { +        ++I; +      } +      continue; +    } + +    bool StoreIsX2, IsOffen; +    if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) { +      CI.InstClass = IsOffen ? 
BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET; +      CI.EltSize = 4; +      CI.IsX2 = StoreIsX2; +      if (findMatchingInst(CI)) { +        Modified = true; +        I = mergeBufferStorePair(CI); +        if (!CI.IsX2) +          CreatedX2++; +      } else { +        ++I; +      } +      continue; +    }      ++I;    } @@ -527,25 +913,33 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {  }  bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { -  if (skipFunction(*MF.getFunction())) +  if (skipFunction(MF.getFunction()))      return false; -  const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); -  if (!STM.loadStoreOptEnabled()) +  STM = &MF.getSubtarget<SISubtarget>(); +  if (!STM->loadStoreOptEnabled())      return false; -  TII = STM.getInstrInfo(); +  TII = STM->getInstrInfo();    TRI = &TII->getRegisterInfo();    MRI = &MF.getRegInfo();    AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); +  assert(MRI->isSSA() && "Must be run on SSA"); +    DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");    bool Modified = false; -  for (MachineBasicBlock &MBB : MF) +  for (MachineBasicBlock &MBB : MF) { +    CreatedX2 = 0;      Modified |= optimizeBlock(MBB); +    // Run again to convert x2 to x4. +    if (CreatedX2 >= 1) +      Modified |= optimizeBlock(MBB); +  } +    return Modified;  }  | 
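
The heart of the new SMEM/VMEM support is the early-return branch added to offsetsCanBeCombined(): two accesses can be merged when their offsets, in element units, are exactly one element apart (two when each half is already a dwordx2) and their cache-policy bits agree. The standalone sketch below restates that check with plain types; field names mirror CombineInfo, but this is an illustration of the logic, not LLVM code.

// Standalone model of the SMEM/VMEM branch added to offsetsCanBeCombined().
// Plain types only; field names mirror CombineInfo but this is not LLVM code.
#include <cstdio>

enum InstClassEnum { DS_READ_WRITE, S_BUFFER_LOAD_IMM, BUFFER_LOAD_OFFEN };

struct Info {
  InstClassEnum InstClass;
  unsigned EltOffset0, EltOffset1; // offsets in element units (byte offset / EltSize)
  bool IsX2;                       // each half is already a dwordx2 access
  bool GLC0, GLC1, SLC0, SLC1;     // cache-policy bits on the two halves
};

// Mergeable when the two accesses touch adjacent elements (one element apart,
// or two for x2 halves) and the cache bits match. SMEM has no slc bit, so slc
// is only compared for the VMEM classes.
static bool offsetsCanBeCombined(const Info &CI) {
  unsigned Diff = CI.IsX2 ? 2 : 1;
  return (CI.EltOffset0 + Diff == CI.EltOffset1 ||
          CI.EltOffset1 + Diff == CI.EltOffset0) &&
         CI.GLC0 == CI.GLC1 &&
         (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
}

int main() {
  Info A{S_BUFFER_LOAD_IMM, 1, 2, false, false, false, false, false};
  Info B{BUFFER_LOAD_OFFEN, 4, 8, false, false, false, false, false};
  std::printf("adjacent s_buffer_load pair:   %d\n", offsetsCanBeCombined(A)); // 1
  std::printf("non-adjacent buffer_load pair: %d\n", offsetsCanBeCombined(B)); // 0
}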
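
The new read2Opcode()/read2ST64Opcode() helpers (and their write2 counterparts) exist because targets that no longer require M0 to be initialized for LDS get separate _gfx9 opcodes. A minimal sketch of that selection, with strings standing in for the AMDGPU::* opcode enumerators:

// Model of read2Opcode()/read2ST64Opcode(): choose the DS_READ2* opcode from
// the element size, whether the stride-64 form is wanted, and whether the
// target still requires M0 init for LDS (pre-gfx9).
#include <cstdio>

static const char *read2Opcode(unsigned EltSize, bool UseST64,
                               bool LdsRequiresM0Init) {
  if (UseST64) {
    if (LdsRequiresM0Init)
      return EltSize == 4 ? "DS_READ2ST64_B32" : "DS_READ2ST64_B64";
    return EltSize == 4 ? "DS_READ2ST64_B32_gfx9" : "DS_READ2ST64_B64_gfx9";
  }
  if (LdsRequiresM0Init)
    return EltSize == 4 ? "DS_READ2_B32" : "DS_READ2_B64";
  return EltSize == 4 ? "DS_READ2_B32_gfx9" : "DS_READ2_B64_gfx9";
}

int main() {
  std::printf("%s\n", read2Opcode(4, false, true));  // DS_READ2_B32
  std::printf("%s\n", read2Opcode(8, true, false));  // DS_READ2ST64_B64_gfx9
}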
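
The new mergeSBufferLoadImmPair/mergeBufferLoadPair helpers split the wide result back into the two original destination registers with COPYs of sub0/sub1 (sub0_sub1/sub2_sub3 when the halves are x2). Because the merged instruction carries the smaller of the two offsets, the sub-register indices are simply swapped when the first instruction in program order had the larger offset. A small sketch of that index selection; the strings stand in for the AMDGPU sub-register enumerators:

// Model of the "handle descending offsets" sub-register swap used when
// copying a merged result back to the two original destinations.
#include <cstdio>
#include <string>
#include <utility>

static std::pair<std::string, std::string>
pickSubRegs(unsigned Offset0, unsigned Offset1, bool IsX2) {
  std::string Idx0 = IsX2 ? "sub0_sub1" : "sub0";
  std::string Idx1 = IsX2 ? "sub2_sub3" : "sub1";
  // The merged load/store uses min(Offset0, Offset1), so if the first original
  // instruction had the larger offset its data lives in the upper half.
  if (Offset0 > Offset1)
    std::swap(Idx0, Idx1);
  return {Idx0, Idx1};
}

int main() {
  auto [ToDest0, ToDest1] = pickSubRegs(/*Offset0=*/8, /*Offset1=*/4, false);
  std::printf("dest0 <- %s, dest1 <- %s\n", ToDest0.c_str(), ToDest1.c_str());
}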
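
Buffer stores are driven by promoteBufferStoreOpcode(), which both classifies the instruction (is it already x2? does it use the OFFEN form?) and names the next-wider opcode; runOnMachineFunction() then reruns optimizeBlock() on any block that created new x2 instructions so they can pair again into x4. The table-driven sketch below covers only the non-_exact OFFEN/OFFSET variants and uses strings in place of the real opcode enumerators; it is a simplified illustration, not the pass itself.

// Simplified model of promoteBufferStoreOpcode(): classify a buffer store and
// return the next-wider opcode, or an empty string if it cannot be promoted.
// The _exact opcode variants handled by the real pass are omitted here.
#include <cstdio>
#include <cstring>

struct Promotion {
  const char *From;
  const char *To;
  bool IsX2;    // the input already writes two dwords
  bool IsOffen; // the input takes a VGPR address (OFFEN form)
};

static const Promotion Table[] = {
  {"BUFFER_STORE_DWORD_OFFEN",    "BUFFER_STORE_DWORDX2_OFFEN",  false, true},
  {"BUFFER_STORE_DWORDX2_OFFEN",  "BUFFER_STORE_DWORDX4_OFFEN",  true,  true},
  {"BUFFER_STORE_DWORD_OFFSET",   "BUFFER_STORE_DWORDX2_OFFSET", false, false},
  {"BUFFER_STORE_DWORDX2_OFFSET", "BUFFER_STORE_DWORDX4_OFFSET", true,  false},
};

static const char *promote(const char *Opc, bool &IsX2, bool &IsOffen) {
  for (const Promotion &P : Table)
    if (std::strcmp(P.From, Opc) == 0) {
      IsX2 = P.IsX2;
      IsOffen = P.IsOffen;
      return P.To;
    }
  IsX2 = IsOffen = false;
  return "";
}

int main() {
  bool IsX2, IsOffen;
  std::printf("%s\n", promote("BUFFER_STORE_DWORDX2_OFFEN", IsX2, IsOffen));
}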
