1 files changed, 79 insertions, 11 deletions
diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp
index 7dfc31bdfa01c..181b11643bf3e 100644
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp
@@ -17,6 +17,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -39,6 +41,12 @@ typedef union {
 
 } Counters;
 
+typedef enum {
+  OTHER,
+  SMEM,
+  VMEM
+} InstType;
+
 typedef Counters RegCounters[512];
 typedef std::pair<unsigned, unsigned> RegInterval;
 
@@ -71,6 +79,9 @@ private:
   /// \brief Different export instruction types seen since last wait.
   unsigned ExpInstrTypesSeen;
 
+  /// \brief Type of the last opcode.
+  InstType LastOpcodeType;
+
   /// \brief Get increment/decrement amount for this instruction.
   Counters getHwCounts(MachineInstr &MI);
 
@@ -81,7 +92,8 @@ private:
   RegInterval getRegInterval(MachineOperand &Op);
 
   /// \brief Handle instructions async components
-  void pushInstruction(MachineInstr &MI);
+  void pushInstruction(MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator I);
 
   /// \brief Insert the actual wait instruction
   bool insertWait(MachineBasicBlock &MBB,
@@ -174,6 +186,29 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
   if (!MI.getDesc().mayStore())
     return false;
 
+  // Check if this operand is the value being stored.
+  // Special case for DS instructions, since the address
+  // operand comes before the value operand and it may have
+  // multiple data operands.
+
+  if (TII->isDS(MI.getOpcode())) {
+    MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data);
+    if (Data && Op.isIdenticalTo(*Data))
+      return true;
+
+    MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
+    if (Data0 && Op.isIdenticalTo(*Data0))
+      return true;
+
+    MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
+    if (Data1 && Op.isIdenticalTo(*Data1))
+      return true;
+
+    return false;
+  }
+
+  // NOTE: This assumes that the value operand is before the
+  // address operand, and that there is only one value operand.
   for (MachineInstr::mop_iterator I = MI.operands_begin(),
        E = MI.operands_end(); I != E; ++I) {
 
@@ -201,10 +236,11 @@ RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {
   return Result;
 }
 
-void SIInsertWaits::pushInstruction(MachineInstr &MI) {
+void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I) {
 
   // Get the hardware counter increments and sum them up
-  Counters Increment = getHwCounts(MI);
+  Counters Increment = getHwCounts(*I);
   unsigned Sum = 0;
 
   for (unsigned i = 0; i < 3; ++i) {
@@ -213,17 +249,42 @@ void SIInsertWaits::pushInstruction(MachineInstr &MI) {
   }
 
   // If we don't increase anything then that's it
-  if (Sum == 0)
+  if (Sum == 0) {
+    LastOpcodeType = OTHER;
     return;
+  }
+
+  if (TRI->ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+    // Any occurence of consecutive VMEM or SMEM instructions forms a VMEM
+    // or SMEM clause, respectively.
+    //
+    // The temporary workaround is to break the clauses with S_NOP.
+    //
+    // The proper solution would be to allocate registers such that all source
+    // and destination registers don't overlap, e.g. this is illegal:
+    //   r0 = load r2
+    //   r2 = load r0
+    if ((LastOpcodeType == SMEM && TII->isSMRD(I->getOpcode())) ||
+        (LastOpcodeType == VMEM && Increment.Named.VM)) {
+      // Insert a NOP to break the clause.
+      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
+          .addImm(0);
+    }
+
+    if (TII->isSMRD(I->getOpcode()))
+      LastOpcodeType = SMEM;
+    else if (Increment.Named.VM)
+      LastOpcodeType = VMEM;
+  }
 
   // Remember which export instructions we have seen
   if (Increment.Named.EXP) {
-    ExpInstrTypesSeen |= MI.getOpcode() == AMDGPU::EXP ? 1 : 2;
+    ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2;
   }
 
-  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
 
-    MachineOperand &Op = MI.getOperand(i);
+    MachineOperand &Op = I->getOperand(i);
     if (!isOpRelevant(Op))
       continue;
 
@@ -300,6 +361,7 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
                   ((Counts.Named.EXP & 0x7) << 4) |
                   ((Counts.Named.LGKM & 0x7) << 8));
 
+  LastOpcodeType = OTHER;
   return true;
 }
 
@@ -346,13 +408,15 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
 bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
   bool Changes = false;
 
-  TII = static_cast<const SIInstrInfo*>(MF.getTarget().getInstrInfo());
-  TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  TRI =
+      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
 
   MRI = &MF.getRegInfo();
 
   WaitedOn = ZeroCounts;
   LastIssued = ZeroCounts;
+  LastOpcodeType = OTHER;
 
   memset(&UsedRegs, 0, sizeof(UsedRegs));
   memset(&DefinedRegs, 0, sizeof(DefinedRegs));
@@ -364,8 +428,12 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
          I != E; ++I) {
 
-      Changes |= insertWait(MBB, I, handleOperands(*I));
-      pushInstruction(*I);
+      // Wait for everything before a barrier.
+      if (I->getOpcode() == AMDGPU::S_BARRIER)
+        Changes |= insertWait(MBB, I, LastIssued);
+      else
+        Changes |= insertWait(MBB, I, handleOperands(*I));
+      pushInstruction(MBB, I);
     }
 
     // Wait for everything at the end of the MBB