summary | refs | log | tree | commit | diff
path: root/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Target/AMDGPU/SIInsertWaitcnts.cpp')
-rw-r--r--  lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 249
1 files changed, 110 insertions, 139 deletions
diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 0f009a48754ad..6bbe5979316da 100644
--- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1,4 +1,4 @@
-//===-- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===/
+//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -21,12 +21,34 @@
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <utility>
+#include <vector>
#define DEBUG_TYPE "si-insert-waitcnts"
@@ -42,7 +64,7 @@ namespace {
enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
-typedef std::pair<signed, signed> RegInterval;
+using RegInterval = std::pair<signed, signed>;
struct {
int32_t VmcntMax;
@@ -101,6 +123,15 @@ enum RegisterMapping {
// "s_waitcnt 0" before use.
class BlockWaitcntBrackets {
public:
+ BlockWaitcntBrackets() {
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ memset(VgprScores[T], 0, sizeof(VgprScores[T]));
+ }
+ }
+
+ ~BlockWaitcntBrackets() = default;
+
static int32_t getWaitCountMax(InstCounterType T) {
switch (T) {
case VM_CNT:
@@ -113,14 +144,14 @@ public:
break;
}
return 0;
- };
+ }
void setScoreLB(InstCounterType T, int32_t Val) {
assert(T < NUM_INST_CNTS);
if (T >= NUM_INST_CNTS)
return;
ScoreLBs[T] = Val;
- };
+ }
void setScoreUB(InstCounterType T, int32_t Val) {
assert(T < NUM_INST_CNTS);
@@ -132,21 +163,21 @@ public:
if (ScoreLBs[T] < UB)
ScoreLBs[T] = UB;
}
- };
+ }
int32_t getScoreLB(InstCounterType T) {
assert(T < NUM_INST_CNTS);
if (T >= NUM_INST_CNTS)
return 0;
return ScoreLBs[T];
- };
+ }
int32_t getScoreUB(InstCounterType T) {
assert(T < NUM_INST_CNTS);
if (T >= NUM_INST_CNTS)
return 0;
return ScoreUBs[T];
- };
+ }
// Mapping from event to counter.
InstCounterType eventCounter(WaitEventType E) {
@@ -218,26 +249,18 @@ public:
void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
int32_t getMaxVGPR() const { return VgprUB; }
int32_t getMaxSGPR() const { return SgprUB; }
+
int32_t getEventUB(enum WaitEventType W) const {
assert(W < NUM_WAIT_EVENTS);
return EventUBs[W];
}
+
bool counterOutOfOrder(InstCounterType T);
unsigned int updateByWait(InstCounterType T, int ScoreToWait);
void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
const MachineRegisterInfo *MRI, WaitEventType E,
MachineInstr &MI);
- BlockWaitcntBrackets()
- : WaitAtBeginning(false), RevisitLoop(false), ValidLoop(false), MixedExpTypes(false),
- LoopRegion(NULL), PostOrder(0), Waitcnt(NULL), VgprUB(0), SgprUB(0) {
- for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
- T = (enum InstCounterType)(T + 1)) {
- memset(VgprScores[T], 0, sizeof(VgprScores[T]));
- }
- }
- ~BlockWaitcntBrackets(){};
-
bool hasPendingSMEM() const {
return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
@@ -266,7 +289,7 @@ public:
int32_t getPostOrder() const { return PostOrder; }
void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
- void clearWaitcnt() { Waitcnt = NULL; }
+ void clearWaitcnt() { Waitcnt = nullptr; }
MachineInstr *getWaitcnt() const { return Waitcnt; }
bool mixedExpTypes() const { return MixedExpTypes; }
@@ -278,13 +301,11 @@ public:
void dump() { print(dbgs()); }
private:
- bool WaitAtBeginning;
- bool RevisitLoop;
- bool ValidLoop;
- bool MixedExpTypes;
- MachineLoop *LoopRegion;
- int32_t PostOrder;
- MachineInstr *Waitcnt;
+ bool WaitAtBeginning = false;
+ bool RevisitLoop = false;
+ bool MixedExpTypes = false;
+ int32_t PostOrder = 0;
+ MachineInstr *Waitcnt = nullptr;
int32_t ScoreLBs[NUM_INST_CNTS] = {0};
int32_t ScoreUBs[NUM_INST_CNTS] = {0};
int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
@@ -292,8 +313,8 @@ private:
int32_t LastFlat[NUM_INST_CNTS] = {0};
// wait_cnt scores for every vgpr.
// Keep track of the VgprUB and SgprUB to make merge at join efficient.
- int32_t VgprUB;
- int32_t SgprUB;
+ int32_t VgprUB = 0;
+ int32_t SgprUB = 0;
int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
// Wait cnt scores for every sgpr, only lgkmcnt is relevant.
int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
@@ -306,38 +327,36 @@ private:
// at the end of the loop footer.
class LoopWaitcntData {
public:
+ LoopWaitcntData() = default;
+ ~LoopWaitcntData() = default;
+
void incIterCnt() { IterCnt++; }
void resetIterCnt() { IterCnt = 0; }
int32_t getIterCnt() { return IterCnt; }
- LoopWaitcntData() : LfWaitcnt(NULL), IterCnt(0) {}
- ~LoopWaitcntData(){};
-
void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
MachineInstr *getWaitcnt() const { return LfWaitcnt; }
void print() {
DEBUG(dbgs() << " iteration " << IterCnt << '\n';);
- return;
}
private:
// s_waitcnt added at the end of loop footer to stablize wait scores
// at the end of the loop footer.
- MachineInstr *LfWaitcnt;
+ MachineInstr *LfWaitcnt = nullptr;
// Number of iterations the loop has been visited, not including the initial
// walk over.
- int32_t IterCnt;
+ int32_t IterCnt = 0;
};
class SIInsertWaitcnts : public MachineFunctionPass {
-
private:
- const SISubtarget *ST;
- const SIInstrInfo *TII;
- const SIRegisterInfo *TRI;
- const MachineRegisterInfo *MRI;
- const MachineLoopInfo *MLI;
+ const SISubtarget *ST = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ const SIRegisterInfo *TRI = nullptr;
+ const MachineRegisterInfo *MRI = nullptr;
+ const MachineLoopInfo *MLI = nullptr;
AMDGPU::IsaInfo::IsaVersion IV;
AMDGPUAS AMDGPUASI;
@@ -357,9 +376,7 @@ private:
public:
static char ID;
- SIInsertWaitcnts()
- : MachineFunctionPass(ID), ST(nullptr), TII(nullptr), TRI(nullptr),
- MRI(nullptr), MLI(nullptr) {}
+ SIInsertWaitcnts() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -376,9 +393,11 @@ public:
void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
// The waitcnt information is copied because it changes as the block is
// traversed.
- KillWaitBrackets.push_back(make_unique<BlockWaitcntBrackets>(*Bracket));
+ KillWaitBrackets.push_back(
+ llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
}
+ bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI,
BlockWaitcntBrackets *ScoreBrackets);
void updateEventWaitCntAfter(MachineInstr &Inst,
@@ -389,7 +408,7 @@ public:
void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
};
-} // End anonymous namespace.
+} // end anonymous namespace
RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
const SIInstrInfo *TII,
@@ -567,13 +586,13 @@ void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
}
#if 0 // TODO: check if this is handled by MUBUF code above.
} else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
- Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
- Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
+ Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
+ Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
unsigned OpNo;//TODO: find the OpNo for this operand;
RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
for (signed RegNo = Interval.first; RegNo < Interval.second;
- ++RegNo) {
+ ++RegNo) {
setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
}
#endif
@@ -642,7 +661,6 @@ void BlockWaitcntBrackets::print(raw_ostream &OS) {
OS << '\n';
}
OS << '\n';
- return;
}
unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
@@ -860,7 +878,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
switch (src_type) {
case SCMEM_LDS:
if (group_is_multi_wave ||
- context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
+ context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
ScoreBrackets->getScoreUB(LGKM_CNT));
// LDS may have to wait for VM_CNT after buffer load to LDS
@@ -874,9 +892,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
case SCMEM_GDS:
if (group_is_multi_wave || fence_is_global) {
EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
- ScoreBrackets->getScoreUB(EXP_CNT));
+ ScoreBrackets->getScoreUB(EXP_CNT));
EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
- ScoreBrackets->getScoreUB(LGKM_CNT));
+ ScoreBrackets->getScoreUB(LGKM_CNT));
}
break;
@@ -886,9 +904,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
case SCMEM_SCATTER:
if (group_is_multi_wave || fence_is_global) {
EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
- ScoreBrackets->getScoreUB(EXP_CNT));
+ ScoreBrackets->getScoreUB(EXP_CNT));
EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
- ScoreBrackets->getScoreUB(VM_CNT));
+ ScoreBrackets->getScoreUB(VM_CNT));
}
break;
@@ -927,13 +945,14 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
// before the call.
if (MI.getOpcode() == SC_CALL) {
if (ScoreBrackets->getScoreUB(EXP_CNT) >
- ScoreBrackets->getScoreLB(EXP_CNT)) {
+ ScoreBrackets->getScoreLB(EXP_CNT)) {
ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
EmitSwaitcnt |= CNT_MASK(EXP_CNT);
}
}
#endif
+ // FIXME: Should not be relying on memoperands.
// Look at the source operands of every instruction to see if
// any of them results from a previous memory operation that affects
// its current usage. If so, an s_waitcnt instruction needs to be
@@ -949,6 +968,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
EmitSwaitcnt |= ScoreBrackets->updateByWait(
VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
}
+
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
const MachineOperand &Op = MI.getOperand(I);
const MachineRegisterInfo &MRIA = *MRI;
@@ -973,6 +993,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
// 2) If a destination operand that was used by a recent export/store ins,
// add s_waitcnt on exp_cnt to guarantee the WAR order.
if (MI.mayStore()) {
+ // FIXME: Should not be relying on memoperands.
for (const MachineMemOperand *Memop : MI.memoperands()) {
unsigned AS = Memop->getAddrSpace();
if (AS != AMDGPUASI.LOCAL_ADDRESS)
@@ -1094,7 +1115,8 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
BlockWaitcntBracketsMap[TBB].get();
if (!ScoreBracket) {
assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end());
- BlockWaitcntBracketsMap[TBB] = make_unique<BlockWaitcntBrackets>();
+ BlockWaitcntBracketsMap[TBB] =
+ llvm::make_unique<BlockWaitcntBrackets>();
ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
}
ScoreBracket->setRevisitLoop(true);
@@ -1141,8 +1163,21 @@ void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
} else {
MBB.push_back(Waitcnt);
}
+}
+
+// This is a flat memory operation. Check to see if it has memory
+// tokens for both LDS and Memory, and if so mark it as a flat.
+bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
+ if (MI.memoperands_empty())
+ return true;
+
+ for (const MachineMemOperand *Memop : MI.memoperands()) {
+ unsigned AS = Memop->getAddrSpace();
+ if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
+ return true;
+ }
- return;
+ return false;
}
void SIInsertWaitcnts::updateEventWaitCntAfter(
@@ -1151,10 +1186,8 @@ void SIInsertWaitcnts::updateEventWaitCntAfter(
// instruction, update the upper-bound of the appropriate counter's
// bracket and the destination operand scores.
// TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
- uint64_t TSFlags = Inst.getDesc().TSFlags;
- if (TII->isDS(Inst) && (TSFlags & SIInstrFlags::LGKM_CNT)) {
- if (TII->getNamedOperand(Inst, AMDGPU::OpName::gds) &&
- TII->getNamedOperand(Inst, AMDGPU::OpName::gds)->getImm() != 0) {
+ if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
+ if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
} else {
@@ -1162,23 +1195,18 @@ void SIInsertWaitcnts::updateEventWaitCntAfter(
}
} else if (TII->isFLAT(Inst)) {
assert(Inst.mayLoad() || Inst.mayStore());
- ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
- ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
- // This is a flat memory operation. Check to see if it has memory
- // tokens for both LDS and Memory, and if so mark it as a flat.
- bool FoundLDSMem = false;
- for (const MachineMemOperand *Memop : Inst.memoperands()) {
- unsigned AS = Memop->getAddrSpace();
- if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
- FoundLDSMem = true;
- }
+ if (TII->usesVM_CNT(Inst))
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+
+ if (TII->usesLGKM_CNT(Inst)) {
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
- // This is a flat memory operation, so note it - it will require
- // that both the VM and LGKM be flushed to zero if it is pending when
- // a VM or LGKM dependency occurs.
- if (FoundLDSMem) {
- ScoreBrackets->setPendingFlat();
+ // This is a flat memory operation, so note it - it will require
+ // that both the VM and LGKM be flushed to zero if it is pending when
+ // a VM or LGKM dependency occurs.
+ if (mayAccessLDSThroughFlat(Inst))
+ ScoreBrackets->setPendingFlat();
}
} else if (SIInstrInfo::isVMEM(Inst) &&
// TODO: get a better carve out.
@@ -1241,7 +1269,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
BlockWaitcntBracketsMap[pred].get();
bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end();
if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
- break;
+ continue;
}
for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
T = (enum InstCounterType)(T + 1)) {
@@ -1280,7 +1308,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
BlockWaitcntBracketsMap[Pred].get();
bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
- break;
+ continue;
}
int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
@@ -1327,7 +1355,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
// Set the register scoreboard.
for (MachineBasicBlock *Pred : Block.predecessors()) {
if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
- break;
+ continue;
}
BlockWaitcntBrackets *PredScoreBrackets =
@@ -1441,7 +1469,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
// the delayed nature of these operations.
for (MachineBasicBlock *Pred : Block.predecessors()) {
if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
- break;
+ continue;
}
BlockWaitcntBrackets *PredScoreBrackets =
@@ -1494,8 +1522,6 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
ScoreBrackets->dump();
});
- bool InsertNOP = false;
-
// Walk over the instructions.
for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
Iter != E;) {
@@ -1555,7 +1581,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
if (RequireCheckResourceType(Inst, context)) {
// Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
ScoreBrackets->setScoreLB(VM_CNT,
- ScoreBrackets->getScoreUB(VM_CNT));
+ ScoreBrackets->getScoreUB(VM_CNT));
}
#endif
@@ -1596,58 +1622,6 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
VCCZBugHandledSet.insert(&Inst);
}
- if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
-
- // This avoids a s_nop after a waitcnt has just been inserted.
- if (!SWaitInst && InsertNOP) {
- BuildMI(Block, Inst, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
- }
- InsertNOP = false;
-
- // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
- // or SMEM clause, respectively.
- //
- // The temporary workaround is to break the clauses with S_NOP.
- //
- // The proper solution would be to allocate registers such that all source
- // and destination registers don't overlap, e.g. this is illegal:
- // r0 = load r2
- // r2 = load r0
- bool IsSMEM = false;
- bool IsVMEM = false;
- if (TII->isSMRD(Inst))
- IsSMEM = true;
- else if (TII->usesVM_CNT(Inst))
- IsVMEM = true;
-
- ++Iter;
- if (Iter == E)
- break;
-
- MachineInstr &Next = *Iter;
-
- // TODO: How about consecutive SMEM instructions?
- // The comments above says break the clause but the code does not.
- // if ((TII->isSMRD(next) && isSMEM) ||
- if (!IsSMEM && TII->usesVM_CNT(Next) && IsVMEM &&
- // TODO: Enable this check when hasSoftClause is upstreamed.
- // ST->hasSoftClauses() &&
- ST->isXNACKEnabled()) {
- // Insert a NOP to break the clause.
- InsertNOP = true;
- continue;
- }
-
- // There must be "S_NOP 0" between an instruction writing M0 and
- // S_SENDMSG.
- if ((Next.getOpcode() == AMDGPU::S_SENDMSG ||
- Next.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
- Inst.definesRegister(AMDGPU::M0))
- InsertNOP = true;
-
- continue;
- }
-
++Iter;
}
@@ -1752,13 +1726,13 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
if (!ScoreBrackets) {
- BlockWaitcntBracketsMap[&MBB] = make_unique<BlockWaitcntBrackets>();
+ BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>();
ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
}
ScoreBrackets->setPostOrder(MBB.getNumber());
MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
- LoopWaitcntDataMap[ContainingLoop] = make_unique<LoopWaitcntData>();
+ LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();
// If we are walking into the block from before the loop, then guarantee
// at least 1 re-walk over the loop to propagate the information, even if
@@ -1819,12 +1793,10 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
++BI) {
-
MachineBasicBlock &MBB = *BI;
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
++I) {
-
if (!HaveScalarStores && TII->isScalarStore(*I))
HaveScalarStores = true;
@@ -1847,7 +1819,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
++I) {
-
if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
SeenDCacheWB = true;
else if (TII->isScalarStore(*I))