diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 842 |
1 file changed, 506 insertions(+), 336 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 44bdbe37dec0..6d4e1d2c898b 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -79,6 +79,13 @@ enum InstClassEnum { MIMG, TBUFFER_LOAD, TBUFFER_STORE, + GLOBAL_LOAD_SADDR, + GLOBAL_STORE_SADDR, + FLAT_LOAD, + FLAT_STORE, + GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of + GLOBAL_STORE // any CombineInfo, they are only ever returned by + // getCommonInstClass. }; struct AddressRegs { @@ -86,6 +93,7 @@ struct AddressRegs { bool SBase = false; bool SRsrc = false; bool SOffset = false; + bool SAddr = false; bool VAddr = false; bool Addr = false; bool SSamp = false; @@ -160,6 +168,11 @@ class SILoadStoreOptimizer : public MachineFunctionPass { } void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO); + + // Compare by pointer order. + bool operator<(const CombineInfo& Other) const { + return (InstClass == MIMG) ? 
DMask < Other.DMask : Offset < Other.Offset; + } }; struct BaseRegisters { @@ -185,6 +198,9 @@ private: AliasAnalysis *AA = nullptr; bool OptimizeAgain; + bool canSwapInstructions(const DenseSet<Register> &ARegDefs, + const DenseSet<Register> &ARegUses, + const MachineInstr &A, const MachineInstr &B) const; static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII, const CombineInfo &Paired); @@ -199,38 +215,43 @@ private: const CombineInfo &Paired); const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; - bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired, - SmallVectorImpl<MachineInstr *> &InstsToMove); + CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired); unsigned read2Opcode(unsigned EltSize) const; unsigned read2ST64Opcode(unsigned EltSize) const; - MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, - CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator + mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore); unsigned write2Opcode(unsigned EltSize) const; unsigned write2ST64Opcode(unsigned EltSize) const; MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator 
mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator InsertBefore); MachineBasicBlock::iterator mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove); + MachineBasicBlock::iterator InsertBefore); + MachineBasicBlock::iterator + mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore); + MachineBasicBlock::iterator + mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore); void updateBaseAndOffset(MachineInstr &I, Register NewBase, int32_t NewOffset) const; @@ -252,6 +273,12 @@ private: MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, std::list<std::list<CombineInfo>> &MergeableInsts) const; + static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI, + const CombineInfo &Paired); + + static InstClassEnum getCommonInstClass(const CombineInfo &CI, + const CombineInfo &Paired); + public: static char ID; @@ -298,10 +325,35 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { switch (Opc) { case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::GLOBAL_LOAD_DWORD: + case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: + case AMDGPU::GLOBAL_STORE_DWORD: + case AMDGPU::GLOBAL_STORE_DWORD_SADDR: + case AMDGPU::FLAT_LOAD_DWORD: + case AMDGPU::FLAT_STORE_DWORD: return 1; case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::GLOBAL_LOAD_DWORDX2: + case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX2: + case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX2: + case AMDGPU::FLAT_STORE_DWORDX2: return 2; + case AMDGPU::GLOBAL_LOAD_DWORDX3: + case 
AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX3: + case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX3: + case AMDGPU::FLAT_STORE_DWORDX3: + return 3; case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + case AMDGPU::GLOBAL_LOAD_DWORDX4: + case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX4: + case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: + case AMDGPU::FLAT_LOAD_DWORDX4: + case AMDGPU::FLAT_STORE_DWORDX4: return 4; case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: return 8; @@ -386,11 +438,40 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::DS_WRITE_B64: case AMDGPU::DS_WRITE_B64_gfx9: return DS_WRITE; + case AMDGPU::GLOBAL_LOAD_DWORD: + case AMDGPU::GLOBAL_LOAD_DWORDX2: + case AMDGPU::GLOBAL_LOAD_DWORDX3: + case AMDGPU::GLOBAL_LOAD_DWORDX4: + case AMDGPU::FLAT_LOAD_DWORD: + case AMDGPU::FLAT_LOAD_DWORDX2: + case AMDGPU::FLAT_LOAD_DWORDX3: + case AMDGPU::FLAT_LOAD_DWORDX4: + return FLAT_LOAD; + case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: + return GLOBAL_LOAD_SADDR; + case AMDGPU::GLOBAL_STORE_DWORD: + case AMDGPU::GLOBAL_STORE_DWORDX2: + case AMDGPU::GLOBAL_STORE_DWORDX3: + case AMDGPU::GLOBAL_STORE_DWORDX4: + case AMDGPU::FLAT_STORE_DWORD: + case AMDGPU::FLAT_STORE_DWORDX2: + case AMDGPU::FLAT_STORE_DWORDX3: + case AMDGPU::FLAT_STORE_DWORDX4: + return FLAT_STORE; + case AMDGPU::GLOBAL_STORE_DWORD_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: + return GLOBAL_STORE_SADDR; } } /// Determines instruction subclass from opcode. Only instructions -/// of the same subclass can be merged together. +/// of the same subclass can be merged together. The merged instruction may have +/// a different subclass but must have the same class. 
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { switch (Opc) { default: @@ -418,9 +499,55 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; + case AMDGPU::GLOBAL_LOAD_DWORD: + case AMDGPU::GLOBAL_LOAD_DWORDX2: + case AMDGPU::GLOBAL_LOAD_DWORDX3: + case AMDGPU::GLOBAL_LOAD_DWORDX4: + case AMDGPU::FLAT_LOAD_DWORD: + case AMDGPU::FLAT_LOAD_DWORDX2: + case AMDGPU::FLAT_LOAD_DWORDX3: + case AMDGPU::FLAT_LOAD_DWORDX4: + return AMDGPU::FLAT_LOAD_DWORD; + case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: + return AMDGPU::GLOBAL_LOAD_DWORD_SADDR; + case AMDGPU::GLOBAL_STORE_DWORD: + case AMDGPU::GLOBAL_STORE_DWORDX2: + case AMDGPU::GLOBAL_STORE_DWORDX3: + case AMDGPU::GLOBAL_STORE_DWORDX4: + case AMDGPU::FLAT_STORE_DWORD: + case AMDGPU::FLAT_STORE_DWORDX2: + case AMDGPU::FLAT_STORE_DWORDX3: + case AMDGPU::FLAT_STORE_DWORDX4: + return AMDGPU::FLAT_STORE_DWORD; + case AMDGPU::GLOBAL_STORE_DWORD_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: + return AMDGPU::GLOBAL_STORE_DWORD_SADDR; } } +// GLOBAL loads and stores are classified as FLAT initially. If both combined +// instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE. +// If either or both instructions are non segment specific FLAT the resulting +// combined operation will be FLAT, potentially promoting one of the GLOBAL +// operations to FLAT. +// For other instructions return the original unmodified class. 
+InstClassEnum +SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI, + const CombineInfo &Paired) { + assert(CI.InstClass == Paired.InstClass); + + if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) && + SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I)) + return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD; + + return CI.InstClass; +} + static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { AddressRegs Result; @@ -480,6 +607,34 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::DS_WRITE_B64_gfx9: Result.Addr = true; return Result; + case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: + case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: + case AMDGPU::GLOBAL_STORE_DWORD_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: + case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: + Result.SAddr = true; + LLVM_FALLTHROUGH; + case AMDGPU::GLOBAL_LOAD_DWORD: + case AMDGPU::GLOBAL_LOAD_DWORDX2: + case AMDGPU::GLOBAL_LOAD_DWORDX3: + case AMDGPU::GLOBAL_LOAD_DWORDX4: + case AMDGPU::GLOBAL_STORE_DWORD: + case AMDGPU::GLOBAL_STORE_DWORDX2: + case AMDGPU::GLOBAL_STORE_DWORDX3: + case AMDGPU::GLOBAL_STORE_DWORDX4: + case AMDGPU::FLAT_LOAD_DWORD: + case AMDGPU::FLAT_LOAD_DWORDX2: + case AMDGPU::FLAT_LOAD_DWORDX3: + case AMDGPU::FLAT_LOAD_DWORDX4: + case AMDGPU::FLAT_STORE_DWORD: + case AMDGPU::FLAT_STORE_DWORDX2: + case AMDGPU::FLAT_STORE_DWORDX3: + case AMDGPU::FLAT_STORE_DWORDX4: + Result.VAddr = true; + return Result; } } @@ -551,6 +706,9 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, if (Regs.SOffset) AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); + if (Regs.SAddr) + AddrIdx[NumAddresses++] = + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); if (Regs.VAddr) AddrIdx[NumAddresses++] = 
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); @@ -579,92 +737,58 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass() { return new SILoadStoreOptimizer(); } -static void moveInstsAfter(MachineBasicBlock::iterator I, - ArrayRef<MachineInstr *> InstsToMove) { - MachineBasicBlock *MBB = I->getParent(); - ++I; - for (MachineInstr *MI : InstsToMove) { - MI->removeFromParent(); - MBB->insert(I, MI); - } -} - static void addDefsUsesToList(const MachineInstr &MI, DenseSet<Register> &RegDefs, - DenseSet<Register> &PhysRegUses) { - for (const MachineOperand &Op : MI.operands()) { - if (Op.isReg()) { - if (Op.isDef()) - RegDefs.insert(Op.getReg()); - else if (Op.readsReg() && Op.getReg().isPhysical()) - PhysRegUses.insert(Op.getReg()); - } + DenseSet<Register> &RegUses) { + for (const auto &Op : MI.operands()) { + if (!Op.isReg()) + continue; + if (Op.isDef()) + RegDefs.insert(Op.getReg()); + if (Op.readsReg()) + RegUses.insert(Op.getReg()); } } -static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, - MachineBasicBlock::iterator B, - AliasAnalysis *AA) { - // RAW or WAR - cannot reorder - // WAW - cannot reorder - // RAR - safe to reorder - return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true); -} - -// Add MI and its defs to the lists if MI reads one of the defs that are -// already in the list. Returns true in that case. -static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs, - DenseSet<Register> &PhysRegUses, - SmallVectorImpl<MachineInstr *> &Insts) { - for (MachineOperand &Use : MI.operands()) { - // If one of the defs is read, then there is a use of Def between I and the - // instruction that I will potentially be merged with. We will need to move - // this instruction after the merged instructions. - // - // Similarly, if there is a def which is read by an instruction that is to - // be moved for merging, then we need to move the def-instruction as well. 
- // This can only happen for physical registers such as M0; virtual - // registers are in SSA form. - if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) || - (Use.isDef() && RegDefs.count(Use.getReg())) || - (Use.isDef() && Use.getReg().isPhysical() && - PhysRegUses.count(Use.getReg())))) { - Insts.push_back(&MI); - addDefsUsesToList(MI, RegDefs, PhysRegUses); - return true; - } - } - - return false; -} - -static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp, - ArrayRef<MachineInstr *> InstsToMove, - AliasAnalysis *AA) { - assert(MemOp.mayLoadOrStore()); - - for (MachineInstr *InstToMove : InstsToMove) { - if (!InstToMove->mayLoadOrStore()) +bool SILoadStoreOptimizer::canSwapInstructions( + const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses, + const MachineInstr &A, const MachineInstr &B) const { + if (A.mayLoadOrStore() && B.mayLoadOrStore() && + (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true)) + return false; + for (const auto &BOp : B.operands()) { + if (!BOp.isReg()) continue; - if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA)) + if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg())) + return false; + if (BOp.isDef() && ARegUses.contains(BOp.getReg())) return false; } return true; } -// This function assumes that \p A and \p B have are identical except for -// size and offset, and they reference adjacent memory. -static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF, - const MachineMemOperand *A, - const MachineMemOperand *B) { - unsigned MinOffset = std::min(A->getOffset(), B->getOffset()); - unsigned Size = A->getSize() + B->getSize(); - // This function adds the offset parameter to the existing offset for A, - // so we pass 0 here as the offset and then manually set it to the correct - // value after the call. 
- MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size); - MMO->setOffset(MinOffset); - return MMO; +// Given that \p CI and \p Paired are adjacent memory operations produce a new +// MMO for the combined operation with a new access size. +MachineMemOperand * +SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI, + const CombineInfo &Paired) { + const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); + const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); + + unsigned Size = MMOa->getSize() + MMOb->getSize(); + + // A base pointer for the combined operation is the same as the leading + // operation's pointer. + if (Paired < CI) + std::swap(MMOa, MMOb); + + MachinePointerInfo PtrInfo(MMOa->getPointerInfo()); + // If merging FLAT and GLOBAL set address space to FLAT. + if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) + PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS; + + MachineFunction *MF = CI.I->getMF(); + return MF->getMachineMemOperand(MMOa, PtrInfo, Size); } bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, @@ -787,8 +911,7 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { return (EltOffset0 + CI.Width == EltOffset1 || EltOffset1 + Paired.Width == EltOffset0) && - CI.CPol == Paired.CPol && - (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol); + CI.CPol == Paired.CPol; } // If the offset in elements doesn't fit in 8-bits, we might be able to use @@ -889,111 +1012,59 @@ SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { return nullptr; } -/// This function assumes that CI comes before Paired in a basic block. -bool SILoadStoreOptimizer::checkAndPrepareMerge( - CombineInfo &CI, CombineInfo &Paired, - SmallVectorImpl<MachineInstr *> &InstsToMove) { +/// This function assumes that CI comes before Paired in a basic block. Return +/// an insertion point for the merged instruction or nullptr on failure. 
+SILoadStoreOptimizer::CombineInfo * +SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, + CombineInfo &Paired) { + // If another instruction has already been merged into CI, it may now be a + // type that we can't do any further merging into. + if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN) + return nullptr; + assert(CI.InstClass == Paired.InstClass); + + if (getInstSubclass(CI.I->getOpcode(), *TII) != + getInstSubclass(Paired.I->getOpcode(), *TII)) + return nullptr; // Check both offsets (or masks for MIMG) can be combined and fit in the // reduced range. - if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired)) - return false; - - if (CI.InstClass != MIMG && - (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))) - return false; - - const unsigned Opc = CI.I->getOpcode(); - const InstClassEnum InstClass = getInstClass(Opc, *TII); - - if (InstClass == UNKNOWN) { - return false; + if (CI.InstClass == MIMG) { + if (!dmasksCanBeCombined(CI, *TII, Paired)) + return nullptr; + } else { + if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)) + return nullptr; } - const unsigned InstSubclass = getInstSubclass(Opc, *TII); - - DenseSet<Register> RegDefsToMove; - DenseSet<Register> PhysRegUsesToMove; - addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); - - MachineBasicBlock::iterator E = std::next(Paired.I); - MachineBasicBlock::iterator MBBI = std::next(CI.I); - MachineBasicBlock::iterator MBBE = CI.I->getParent()->end(); - for (; MBBI != E; ++MBBI) { - - if (MBBI == MBBE) { - // CombineInfo::Order is a hint on the instruction ordering within the - // basic block. This hint suggests that CI precedes Paired, which is - // true most of the time. However, moveInstsAfter() processing a - // previous list may have changed this order in a situation when it - // moves an instruction which exists in some other merge list. - // In this case it must be dependent. 
- return false; - } - if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) || - (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) { - // This is not a matching instruction, but we can keep looking as - // long as one of these conditions are met: - // 1. It is safe to move I down past MBBI. - // 2. It is safe to move MBBI down past the instruction that I will - // be merged into. - - if (MBBI->mayLoadOrStore() && - (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || - !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) { - // We fail condition #1, but we may still be able to satisfy condition - // #2. Add this instruction to the move list and then we will check - // if condition #2 holds once we have selected the matching instruction. - InstsToMove.push_back(&*MBBI); - addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove); - continue; - } - - // When we match I with another DS instruction we will be moving I down - // to the location of the matched instruction any uses of I will need to - // be moved down as well. - addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, - InstsToMove); - continue; + DenseSet<Register> RegDefs; + DenseSet<Register> RegUses; + CombineInfo *Where; + if (CI.I->mayLoad()) { + // Try to hoist Paired up to CI. + addDefsUsesToList(*Paired.I, RegDefs, RegUses); + for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) { + if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI)) + return nullptr; } - - // Handle a case like - // DS_WRITE_B32 addr, v, idx0 - // w = DS_READ_B32 addr, idx0 - // DS_WRITE_B32 addr, f(w), idx1 - // where the DS_READ_B32 ends up in InstsToMove and therefore prevents - // merging of the two writes. - if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, - InstsToMove)) - continue; - - if (&*MBBI == &*Paired.I) { - // We need to go through the list of instructions that we plan to - // move and make sure they are all safe to move down past the merged - // instruction. 
- if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) { - - // Call offsetsCanBeCombined with modify = true so that the offsets are - // correct for the new instruction. This should return true, because - // this function should only be called on CombineInfo objects that - // have already been confirmed to be mergeable. - if (CI.InstClass != MIMG) - offsetsCanBeCombined(CI, *STM, Paired, true); - return true; - } - return false; + Where = &CI; + } else { + // Try to sink CI down to Paired. + addDefsUsesToList(*CI.I, RegDefs, RegUses); + for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) { + if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI)) + return nullptr; } - - // We've found a load/store that we couldn't merge for some reason. - // We could potentially keep looking, but we'd need to make sure that - // it was safe to move I and also all the instruction in InstsToMove - // down past this instruction. - // check if we can move I across MBBI and if we can move all I's users - if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || - !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) - break; + Where = &Paired; } - return false; + + // Call offsetsCanBeCombined with modify = true so that the offsets are + // correct for the new instruction. This should return true, because + // this function should only be called on CombineInfo objects that + // have already been confirmed to be mergeable. 
+ if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE) + offsetsCanBeCombined(CI, *STM, Paired, true); + return Where; } unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { @@ -1012,7 +1083,7 @@ unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); // Be careful, since the addresses could be subregisters themselves in weird @@ -1051,13 +1122,13 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, unsigned BaseRegFlags = 0; if (CI.BaseOff) { Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) .addImm(CI.BaseOff); BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; - TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) + TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) .addReg(ImmReg) .addReg(AddrReg->getReg(), 0, BaseSubReg) .addImm(0); // clamp bit @@ -1065,7 +1136,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, } MachineInstrBuilder Read2 = - BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg) + BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg) .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr .addImm(NewOffset0) // offset0 .addImm(NewOffset1) // offset1 @@ -1077,14 +1148,12 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); // Copy to the old destination registers. - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. 
.addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1109,9 +1178,9 @@ unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { : AMDGPU::DS_WRITE2ST64_B64_gfx9; } -MachineBasicBlock::iterator -SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( + CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); // Be sure to use .addOperand(), and not .addReg() with these. We want to be @@ -1145,13 +1214,13 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, unsigned BaseRegFlags = 0; if (CI.BaseOff) { Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) .addImm(CI.BaseOff); BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; - TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) + TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) .addReg(ImmReg) .addReg(AddrReg->getReg(), 0, BaseSubReg) .addImm(0); // clamp bit @@ -1159,7 +1228,7 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, } MachineInstrBuilder Write2 = - BuildMI(*MBB, Paired.I, DL, Write2Desc) + BuildMI(*MBB, InsertBefore, DL, Write2Desc) .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr .add(*Data0) // data0 .add(*Data1) // data1 @@ -1168,8 +1237,6 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 
.addImm(0) // gds .cloneMergedMemRefs({&*CI.I, &*Paired.I}); - moveInstsAfter(Write2, InstsToMove); - CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1179,7 +1246,7 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1191,7 +1258,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, unsigned DMaskIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { if (I == DMaskIdx) MIB.addImm(MergedDMask); @@ -1204,10 +1271,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, // will return true if this is the case. 
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - - MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); unsigned SubRegIdx0, SubRegIdx1; std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired); @@ -1217,14 +1281,12 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. .addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1233,7 +1295,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1248,15 +1310,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( // will return true if this is the case. 
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - MachineInstr *New = - BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) - .addImm(MergedOffset) // offset - .addImm(CI.CPol) // cpol - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) + .addImm(MergedOffset) // offset + .addImm(CI.CPol) // cpol + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -1267,14 +1326,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. 
.addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1283,7 +1340,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1295,7 +1352,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( Register DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); AddressRegs Regs = getRegs(Opcode, *TII); @@ -1307,9 +1364,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( // will return true if this is the case. 
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - MachineInstr *New = MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -1317,7 +1371,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( .addImm(CI.CPol) // cpol .addImm(0) // tfe .addImm(0) // swz - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -1328,14 +1382,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. 
.addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1344,7 +1396,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1356,7 +1408,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( Register DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); AddressRegs Regs = getRegs(Opcode, *TII); @@ -1371,9 +1423,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( // will return true if this is the case. 
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - MachineInstr *New = MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -1382,8 +1431,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( .addImm(CI.CPol) // cpol .addImm(0) // tfe .addImm(0) // swz - .addMemOperand( - combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -1394,14 +1442,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, CopyDesc) + BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. 
.addReg(DestReg, 0, SubRegIdx0); - MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) - .add(*Dest1) - .addReg(DestReg, RegState::Kill, SubRegIdx1); - - moveInstsAfter(Copy1, InstsToMove); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1410,7 +1456,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1427,13 +1473,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) .add(*Src0) .addImm(SubRegIdx0) .add(*Src1) .addImm(SubRegIdx1); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); AddressRegs Regs = getRegs(Opcode, *TII); @@ -1449,9 +1495,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( // will return true if this is the case. 
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - MachineInstr *New = MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -1460,10 +1503,92 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( .addImm(CI.CPol) // cpol .addImm(0) // tfe .addImm(0) // swz - .addMemOperand( - combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); + + CI.I->eraseFromParent(); + Paired.I->eraseFromParent(); + return New; +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( + CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore) { + MachineBasicBlock *MBB = CI.I->getParent(); + DebugLoc DL = CI.I->getDebugLoc(); - moveInstsAfter(MIB, InstsToMove); + const unsigned Opcode = getNewOpcode(CI, Paired); + + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); + Register DestReg = MRI->createVirtualRegister(SuperRC); + + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); + + if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) + MIB.add(*SAddr); + + MachineInstr *New = + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) + .addImm(std::min(CI.Offset, Paired.Offset)) + .addImm(CI.CPol) + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); + + std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); + + // Copy to the old destination registers. 
+ const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); + const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); + const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); + + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest0) // Copy to same destination including flags and sub reg. + .addReg(DestReg, 0, SubRegIdx0); + BuildMI(*MBB, InsertBefore, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); + + CI.I->eraseFromParent(); + Paired.I->eraseFromParent(); + return New; +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( + CombineInfo &CI, CombineInfo &Paired, + MachineBasicBlock::iterator InsertBefore) { + MachineBasicBlock *MBB = CI.I->getParent(); + DebugLoc DL = CI.I->getDebugLoc(); + + const unsigned Opcode = getNewOpcode(CI, Paired); + + std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); + + // Copy to the new source register. 
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); + Register SrcReg = MRI->createVirtualRegister(SuperRC); + + const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); + const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); + + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) + .add(*Src0) + .addImm(SubRegIdx0) + .add(*Src1) + .addImm(SubRegIdx1); + + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) + .addReg(SrcReg, RegState::Kill); + + if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) + MIB.add(*SAddr); + + MachineInstr *New = + MIB.addImm(std::min(CI.Offset, Paired.Offset)) + .addImm(CI.CPol) + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1474,7 +1599,7 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired) { const unsigned Width = CI.Width + Paired.Width; - switch (CI.InstClass) { + switch (getCommonInstClass(CI, Paired)) { default: assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); // FIXME: Handle d16 correctly @@ -1498,6 +1623,72 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, case 8: return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; } + case GLOBAL_LOAD: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::GLOBAL_LOAD_DWORDX2; + case 3: + return AMDGPU::GLOBAL_LOAD_DWORDX3; + case 4: + return AMDGPU::GLOBAL_LOAD_DWORDX4; + } + case GLOBAL_LOAD_SADDR: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR; + case 3: + return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR; + case 4: + return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR; + } + case GLOBAL_STORE: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::GLOBAL_STORE_DWORDX2; + case 3: + return AMDGPU::GLOBAL_STORE_DWORDX3; + case 4: + return 
AMDGPU::GLOBAL_STORE_DWORDX4; + } + case GLOBAL_STORE_SADDR: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR; + case 3: + return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR; + case 4: + return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR; + } + case FLAT_LOAD: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::FLAT_LOAD_DWORDX2; + case 3: + return AMDGPU::FLAT_LOAD_DWORDX3; + case 4: + return AMDGPU::FLAT_LOAD_DWORDX4; + } + case FLAT_STORE: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::FLAT_STORE_DWORDX2; + case 3: + return AMDGPU::FLAT_STORE_DWORDX3; + case 4: + return AMDGPU::FLAT_STORE_DWORDX4; + } case MIMG: assert((countPopulation(CI.DMask | Paired.DMask) == Width) && "No overlaps"); @@ -1508,15 +1699,9 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, std::pair<unsigned, unsigned> SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) { - bool ReverseOrder; - if (CI.InstClass == MIMG) { - assert( - (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) && - "No overlaps"); - ReverseOrder = CI.DMask > Paired.DMask; - } else { - ReverseOrder = CI.Offset > Paired.Offset; - } + assert((CI.InstClass != MIMG || (countPopulation(CI.DMask | Paired.DMask) == + CI.Width + Paired.Width)) && + "No overlaps"); unsigned Idx0; unsigned Idx1; @@ -1532,7 +1717,7 @@ SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, assert(CI.Width >= 1 && CI.Width <= 4); assert(Paired.Width >= 1 && Paired.Width <= 4); - if (ReverseOrder) { + if (Paired < CI) { Idx1 = Idxs[0][Paired.Width - 1]; Idx0 = Idxs[Paired.Width][CI.Width - 1]; } else { @@ -1569,7 +1754,7 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( CombineInfo &CI, CombineInfo &Paired, - const SmallVectorImpl<MachineInstr *> &InstsToMove) { + MachineBasicBlock::iterator InsertBefore) { 
MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); @@ -1586,13 +1771,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); - BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) + BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) .add(*Src0) .addImm(SubRegIdx0) .add(*Src1) .addImm(SubRegIdx1); - auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) + auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); AddressRegs Regs = getRegs(Opcode, *TII); @@ -1606,9 +1791,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( // will return true if this is the case. assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); - const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); - const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); - MachineInstr *New = MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -1616,9 +1798,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( .addImm(CI.CPol) // cpol .addImm(0) // tfe .addImm(0) // swz - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); - - moveInstsAfter(MIB, InstsToMove); + .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1846,7 +2026,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192 // has 13bit distance from &a + 4096. The heuristic considers &a + 8192 // as the new-base(anchor) because of the maximum distance which can - // accomodate more intermediate bases presumeably. + // accommodate more intermediate bases presumably. 
// // Step3: move (&a + 8192) above load1. Compute and promote offsets from // (&a + 8192) for load1, load2, load4. @@ -2098,8 +2278,8 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( CombineInfo &CI = *First; CombineInfo &Paired = *Second; - SmallVector<MachineInstr *, 8> InstsToMove; - if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) { + CombineInfo *Where = checkAndPrepareMerge(CI, Paired); + if (!Where) { ++I; continue; } @@ -2108,66 +2288,56 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I); + MachineBasicBlock::iterator NewMI; switch (CI.InstClass) { default: llvm_unreachable("unknown InstClass"); break; - case DS_READ: { - MachineBasicBlock::iterator NewMI = - mergeRead2Pair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); + case DS_READ: + NewMI = mergeRead2Pair(CI, Paired, Where->I); break; - } - case DS_WRITE: { - MachineBasicBlock::iterator NewMI = - mergeWrite2Pair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); + case DS_WRITE: + NewMI = mergeWrite2Pair(CI, Paired, Where->I); break; - } - case S_BUFFER_LOAD_IMM: { - MachineBasicBlock::iterator NewMI = - mergeSBufferLoadImmPair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 8; + case S_BUFFER_LOAD_IMM: + NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 8; break; - } - case BUFFER_LOAD: { - MachineBasicBlock::iterator NewMI = - mergeBufferLoadPair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 4; + case BUFFER_LOAD: + NewMI = mergeBufferLoadPair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; break; - } - case BUFFER_STORE: { - MachineBasicBlock::iterator NewMI = - mergeBufferStorePair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 4; + case BUFFER_STORE: + NewMI = 
mergeBufferStorePair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; break; - } - case MIMG: { - MachineBasicBlock::iterator NewMI = - mergeImagePair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 4; + case MIMG: + NewMI = mergeImagePair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; break; - } - case TBUFFER_LOAD: { - MachineBasicBlock::iterator NewMI = - mergeTBufferLoadPair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 4; + case TBUFFER_LOAD: + NewMI = mergeTBufferLoadPair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; break; - } - case TBUFFER_STORE: { - MachineBasicBlock::iterator NewMI = - mergeTBufferStorePair(CI, Paired, InstsToMove); - CI.setMI(NewMI, *this); - OptimizeListAgain |= (CI.Width + Paired.Width) < 4; + case TBUFFER_STORE: + NewMI = mergeTBufferStorePair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; + break; + case FLAT_LOAD: + case GLOBAL_LOAD: + case GLOBAL_LOAD_SADDR: + NewMI = mergeFlatLoadPair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; + break; + case FLAT_STORE: + case GLOBAL_STORE: + case GLOBAL_STORE_SADDR: + NewMI = mergeFlatStorePair(CI, Paired, Where->I); + OptimizeListAgain |= CI.Width + Paired.Width < 4; break; } - } - CI.Order = Paired.Order; + CI.setMI(NewMI, *this); + CI.Order = Where->Order; if (I == Second) I = Next; |
