diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp | 457 |
1 file changed, 457 insertions, 0 deletions
//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SetVector.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-insert-delay-alu"

namespace {

class AMDGPUInsertDelayAlu : public MachineFunctionPass {
public:
  static char ID;

  // Target hooks cached in runOnMachineFunction for use by the helpers below.
  const SIInstrInfo *SII;
  const TargetRegisterInfo *TRI;

  // Used to look up operand latencies when recording how long a def's result
  // takes to become available.
  TargetSchedModel SchedModel;

  AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  // Return true if MI waits for all outstanding VALU instructions to complete.
  static bool instructionWaitsForVALU(const MachineInstr &MI) {
    // These instruction types wait for VA_VDST==0 before issuing.
    const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
                               SIInstrFlags::FLAT | SIInstrFlags::MIMG |
                               SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
    if (MI.getDesc().TSFlags & VA_VDST_0)
      return true;
    if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
        MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
      return true;
    // An s_waitcnt_depctr with a zero VA_VDST field also waits for all
    // outstanding VALU instructions. NOTE(review): 0xf000 is assumed to be the
    // VA_VDST field mask in the depctr immediate — confirm against the depctr
    // encoding for the targets this runs on.
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        (MI.getOperand(0).getImm() & 0xf000) == 0)
      return true;
    return false;
  }

  // Types of delay that can be encoded in an s_delay_alu instruction.
  enum DelayType { VALU, TRANS, SALU, OTHER };

  // Get the delay type for an instruction with the specified TSFlags.
  // Note the order: a TRANS instruction also has the VALU flag set, so TRANS
  // must be tested first.
  static DelayType getDelayType(uint64_t TSFlags) {
    if (TSFlags & SIInstrFlags::TRANS)
      return TRANS;
    if (TSFlags & SIInstrFlags::VALU)
      return VALU;
    if (TSFlags & SIInstrFlags::SALU)
      return SALU;
    return OTHER;
  }

  // Information about the last instruction(s) that wrote to a particular
  // regunit. In straight-line code there will only be one such instruction, but
  // when control flow converges we merge the delay information from each path
  // to represent the union of the worst-case delays of each type.
  struct DelayInfo {
    // One larger than the maximum number of (non-TRANS) VALU instructions we
    // can encode in an s_delay_alu instruction.
    static const unsigned VALU_MAX = 5;

    // One larger than the maximum number of TRANS instructions we can encode in
    // an s_delay_alu instruction.
    static const unsigned TRANS_MAX = 4;

    // If it was written by a (non-TRANS) VALU, remember how many clock cycles
    // are left until it completes, and how many other (non-TRANS) VALU we have
    // seen since it was issued.
    uint8_t VALUCycles = 0;
    uint8_t VALUNum = VALU_MAX;

    // If it was written by a TRANS, remember how many clock cycles are left
    // until it completes, and how many other TRANS we have seen since it was
    // issued.
    uint8_t TRANSCycles = 0;
    uint8_t TRANSNum = TRANS_MAX;
    // Also remember how many other (non-TRANS) VALU we have seen since it was
    // issued. When an instruction depends on both a prior TRANS and a prior
    // non-TRANS VALU, this is used to decide whether to encode a wait for just
    // one or both of them.
    uint8_t TRANSNumVALU = VALU_MAX;

    // If it was written by an SALU, remember how many clock cycles are left
    // until it completes.
    uint8_t SALUCycles = 0;

    DelayInfo() = default;

    // Record a single write of the given Type whose result is available after
    // Cycles clock cycles.
    DelayInfo(DelayType Type, unsigned Cycles) {
      switch (Type) {
      default:
        llvm_unreachable("unexpected type");
      case VALU:
        VALUCycles = Cycles;
        VALUNum = 0;
        break;
      case TRANS:
        TRANSCycles = Cycles;
        TRANSNum = 0;
        TRANSNumVALU = 0;
        break;
      case SALU:
        SALUCycles = Cycles;
        break;
      }
    }

    bool operator==(const DelayInfo &RHS) const {
      return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
             TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
             TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
    }

    bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }

    // Merge another DelayInfo into this one, to represent the union of the
    // worst-case delays of each type.
    void merge(const DelayInfo &RHS) {
      VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
      VALUNum = std::min(VALUNum, RHS.VALUNum);
      TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
      TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
      TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
      SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
    }

    // Update this DelayInfo after issuing an instruction of the given Type.
    // Cycles is the number of cycles it takes to issue the instruction. Return
    // true if there is no longer any useful delay info, i.e. the tracked
    // instruction(s) have all completed or fallen too far back to encode.
    bool advance(DelayType Type, unsigned Cycles) {
      bool Erase = true;

      VALUNum += (Type == VALU);
      if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
        // Forget about the VALU instruction. It was too far back or has
        // definitely completed by now.
        VALUNum = VALU_MAX;
        VALUCycles = 0;
      } else {
        VALUCycles -= Cycles;
        Erase = false;
      }

      TRANSNum += (Type == TRANS);
      TRANSNumVALU += (Type == VALU);
      if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
        // Forget about any TRANS instruction. It was too far back or has
        // definitely completed by now.
        TRANSNum = TRANS_MAX;
        TRANSNumVALU = VALU_MAX;
        TRANSCycles = 0;
      } else {
        TRANSCycles -= Cycles;
        Erase = false;
      }

      if (SALUCycles <= Cycles) {
        // Forget about any SALU instruction. It has definitely completed by
        // now.
        SALUCycles = 0;
      } else {
        SALUCycles -= Cycles;
        Erase = false;
      }

      return Erase;
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump() const {
      if (VALUCycles)
        dbgs() << " VALUCycles=" << (int)VALUCycles;
      if (VALUNum < VALU_MAX)
        dbgs() << " VALUNum=" << (int)VALUNum;
      if (TRANSCycles)
        dbgs() << " TRANSCycles=" << (int)TRANSCycles;
      if (TRANSNum < TRANS_MAX)
        dbgs() << " TRANSNum=" << (int)TRANSNum;
      if (TRANSNumVALU < VALU_MAX)
        dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
      if (SALUCycles)
        dbgs() << " SALUCycles=" << (int)SALUCycles;
    }
#endif
  };

  // A map from regunits to the delay info for that regunit.
  struct DelayState : DenseMap<unsigned, DelayInfo> {
    // Merge another DelayState into this one by merging the delay info for each
    // regunit.
    void merge(const DelayState &RHS) {
      for (const auto &KV : RHS) {
        iterator It;
        bool Inserted;
        std::tie(It, Inserted) = insert(KV);
        if (!Inserted)
          It->second.merge(KV.second);
      }
    }

    // Advance the delay info for each regunit, erasing any that are no longer
    // useful.
    void advance(DelayType Type, unsigned Cycles) {
      iterator Next;
      for (auto I = begin(), E = end(); I != E; I = Next) {
        // Grab the successor first: erase(I) invalidates I.
        Next = std::next(I);
        if (I->second.advance(Type, Cycles))
          erase(I);
      }
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(const TargetRegisterInfo *TRI) const {
      if (empty()) {
        dbgs() << " empty\n";
        return;
      }

      // Dump DelayInfo for each RegUnit in numerical order.
      SmallVector<const_iterator, 8> Order;
      Order.reserve(size());
      for (const_iterator I = begin(), E = end(); I != E; ++I)
        Order.push_back(I);
      llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
        return A->first < B->first;
      });
      for (const_iterator I : Order) {
        dbgs() << " " << printRegUnit(I->first, TRI);
        I->second.dump();
        dbgs() << "\n";
      }
    }
#endif
  };

  // The saved delay state at the end of each basic block.
  DenseMap<MachineBasicBlock *, DelayState> BlockState;

  // Emit an s_delay_alu instruction if necessary before MI. Delay describes
  // the outstanding writes MI depends on; LastDelayAlu is the most recently
  // emitted s_delay_alu that still has a free second slot, or null. Returns
  // the s_delay_alu to remember for next time (or null if there is no room
  // left in it).
  //
  // NOTE(review): the masks/shifts below assume the s_delay_alu immediate
  // layout instid0 in bits [3:0], skip in bits [6:4], instid1 in bits [10:7]
  // — confirm against the ISA reference. 0x780 tests instid1, 0xf tests
  // instid0.
  MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
                             MachineInstr *LastDelayAlu) {
    unsigned Imm = 0;

    // Wait for a TRANS instruction.
    if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
      Imm |= 4 + Delay.TRANSNum;

    // Wait for a VALU instruction (if it's more recent than any TRANS
    // instruction that we're also waiting for).
    if (Delay.VALUNum < DelayInfo::VALU_MAX &&
        Delay.VALUNum <= Delay.TRANSNumVALU) {
      if (Imm & 0xf)
        Imm |= Delay.VALUNum << 7;
      else
        Imm |= Delay.VALUNum;
    }

    // Wait for an SALU instruction.
    if (Delay.SALUCycles) {
      if (Imm & 0x780) {
        // We have already encoded a VALU and a TRANS delay. There's no room in
        // the encoding for an SALU delay as well, so just drop it.
      } else if (Imm & 0xf) {
        Imm |= (Delay.SALUCycles + 8) << 7;
      } else {
        Imm |= Delay.SALUCycles + 8;
      }
    }

    // Don't emit the s_delay_alu instruction if there's nothing to wait for.
    if (!Imm)
      return LastDelayAlu;

    // If we only need to wait for one instruction, try encoding it in the last
    // s_delay_alu that we emitted.
    if (!(Imm & 0x780) && LastDelayAlu) {
      // Count the real (non-bundle-header, non-meta) instructions between the
      // last s_delay_alu and MI; the skip field can only express up to 5.
      unsigned Skip = 0;
      for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
                E = MachineBasicBlock::instr_iterator(MI);
           ++I != E;) {
        if (!I->isBundle() && !I->isMetaInstruction())
          ++Skip;
      }
      if (Skip < 6) {
        MachineOperand &Op = LastDelayAlu->getOperand(0);
        unsigned LastImm = Op.getImm();
        assert((LastImm & ~0xf) == 0 &&
               "Remembered an s_delay_alu with no room for another delay!");
        LastImm |= Imm << 7 | Skip << 4;
        Op.setImm(LastImm);
        return nullptr;
      }
    }

    auto &MBB = *MI.getParent();
    MachineInstr *DelayAlu =
        BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
    // Remember the s_delay_alu for next time if there is still room in it to
    // encode another delay.
    return (Imm & 0x780) ? nullptr : DelayAlu;
  }

  // Simulate one pass over MBB, starting from the merged end-of-block states
  // of its predecessors. If Emit is false, only (re)compute the cached
  // end-of-block state and report whether it changed; if Emit is true, also
  // insert s_delay_alu instructions (the state must already be at a fixed
  // point).
  bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
    DelayState State;
    for (auto *Pred : MBB.predecessors())
      State.merge(BlockState[Pred]);

    LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)
                      << "\n";
               State.dump(TRI););

    bool Changed = false;
    MachineInstr *LastDelayAlu = nullptr;

    // Iterate over the contents of bundles, but don't emit any instructions
    // inside a bundle.
    for (auto &MI : MBB.instrs()) {
      if (MI.isBundle() || MI.isMetaInstruction())
        continue;

      // Ignore some more instructions that do not generate any code.
      switch (MI.getOpcode()) {
      case AMDGPU::SI_RETURN_TO_EPILOG:
        continue;
      }

      DelayType Type = getDelayType(MI.getDesc().TSFlags);

      if (instructionWaitsForVALU(MI)) {
        // Forget about all outstanding VALU delays.
        // NOTE(review): this also clears any SALU delay info, on the
        // assumption that is conservative/acceptable here.
        State = DelayState();
      } else if (Type != OTHER) {
        DelayInfo Delay;
        // TODO: Scan implicit uses too?
        for (const auto &Op : MI.explicit_uses()) {
          if (Op.isReg()) {
            // One of the operands of the writelane is also the output operand.
            // This creates the insertion of redundant delays. Hence, we have to
            // ignore this operand.
            if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
              continue;
            for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) {
              auto It = State.find(*UI);
              if (It != State.end()) {
                // The dependency is about to be satisfied by the delay we
                // emit, so drop it from the state.
                Delay.merge(It->second);
                State.erase(*UI);
              }
            }
          }
        }
        if (Emit && !MI.isBundledWithPred()) {
          // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
          // just ignore them?
          LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
        }
      }

      if (Type != OTHER) {
        // Record this instruction's writes so later readers know what to wait
        // for.
        // TODO: Scan implicit defs too?
        for (const auto &Op : MI.defs()) {
          unsigned Latency = SchedModel.computeOperandLatency(
              &MI, MI.getOperandNo(&Op), nullptr, 0);
          for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI)
            State[*UI] = DelayInfo(Type, Latency);
        }
      }

      // Advance by the number of cycles it takes to issue this instruction.
      // TODO: Use a more advanced model that accounts for instructions that
      // take multiple cycles to issue on a particular pipeline.
      unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
      // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
      // instructions on the assumption that they will usually have to be issued
      // twice?
      State.advance(Type, Cycles);

      LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););
    }

    if (Emit) {
      assert(State == BlockState[&MBB] &&
             "Basic block state should not have changed on final pass!");
    } else if (State != BlockState[&MBB]) {
      BlockState[&MBB] = std::move(State);
      Changed = true;
    }
    return Changed;
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    if (skipFunction(MF.getFunction()))
      return false;

    LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
                      << "\n");

    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasDelayAlu())
      return false;

    SII = ST.getInstrInfo();
    TRI = ST.getRegisterInfo();

    SchedModel.init(&ST);

    // Calculate the delay state for each basic block, iterating until we reach
    // a fixed point. Seeding the worklist in reverse order means blocks tend
    // to be processed after their predecessors (it is popped from the back).
    SetVector<MachineBasicBlock *> WorkList;
    for (auto &MBB : reverse(MF))
      WorkList.insert(&MBB);
    while (!WorkList.empty()) {
      auto &MBB = *WorkList.pop_back_val();
      bool Changed = runOnMachineBasicBlock(MBB, false);
      if (Changed)
        WorkList.insert(MBB.succ_begin(), MBB.succ_end());
    }

    LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");

    // Make one last pass over all basic blocks to emit s_delay_alu
    // instructions.
    bool Changed = false;
    for (auto &MBB : MF)
      Changed |= runOnMachineBasicBlock(MBB, true);
    return Changed;
  }
};

} // namespace

char AMDGPUInsertDelayAlu::ID = 0;

char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;

INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
                false, false)