aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp457
1 file changed, 457 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
new file mode 100644
index 000000000000..c9cdbc89f3a4
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -0,0 +1,457 @@
+//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/SetVector.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-insert-delay-alu"
+
+namespace {
+
+class AMDGPUInsertDelayAlu : public MachineFunctionPass {
+public:
+  static char ID;
+
+  const SIInstrInfo *SII;
+  const TargetRegisterInfo *TRI;
+
+  TargetSchedModel SchedModel;
+
+  AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  // Return true if MI waits for all outstanding VALU instructions to complete.
+  static bool instructionWaitsForVALU(const MachineInstr &MI) {
+    // These instruction types wait for VA_VDST==0 before issuing.
+    const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
+                               SIInstrFlags::FLAT | SIInstrFlags::MIMG |
+                               SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
+    if (MI.getDesc().TSFlags & VA_VDST_0)
+      return true;
+    if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
+        MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
+      return true;
+    // An s_waitcnt_depctr whose VA_VDST field (immediate bits 15:12, masked
+    // by 0xf000 here) is zero also waits for all outstanding VALU results.
+    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+        (MI.getOperand(0).getImm() & 0xf000) == 0)
+      return true;
+    return false;
+  }
+
+  // Types of delay that can be encoded in an s_delay_alu instruction.
+  enum DelayType { VALU, TRANS, SALU, OTHER };
+
+  // Get the delay type for an instruction with the specified TSFlags.
+  // Note: TRANS is checked first because TRANS instructions also carry the
+  // VALU flag.
+  static DelayType getDelayType(uint64_t TSFlags) {
+    if (TSFlags & SIInstrFlags::TRANS)
+      return TRANS;
+    if (TSFlags & SIInstrFlags::VALU)
+      return VALU;
+    if (TSFlags & SIInstrFlags::SALU)
+      return SALU;
+    return OTHER;
+  }
+
+  // Information about the last instruction(s) that wrote to a particular
+  // regunit. In straight-line code there will only be one such instruction, but
+  // when control flow converges we merge the delay information from each path
+  // to represent the union of the worst-case delays of each type.
+  struct DelayInfo {
+    // One larger than the maximum number of (non-TRANS) VALU instructions we
+    // can encode in an s_delay_alu instruction.
+    static const unsigned VALU_MAX = 5;
+
+    // One larger than the maximum number of TRANS instructions we can encode in
+    // an s_delay_alu instruction.
+    static const unsigned TRANS_MAX = 4;
+
+    // If it was written by a (non-TRANS) VALU, remember how many clock cycles
+    // are left until it completes, and how many other (non-TRANS) VALU we have
+    // seen since it was issued.
+    uint8_t VALUCycles = 0;
+    uint8_t VALUNum = VALU_MAX;
+
+    // If it was written by a TRANS, remember how many clock cycles are left
+    // until it completes, and how many other TRANS we have seen since it was
+    // issued.
+    uint8_t TRANSCycles = 0;
+    uint8_t TRANSNum = TRANS_MAX;
+    // Also remember how many other (non-TRANS) VALU we have seen since it was
+    // issued. When an instruction depends on both a prior TRANS and a prior
+    // non-TRANS VALU, this is used to decide whether to encode a wait for just
+    // one or both of them.
+    uint8_t TRANSNumVALU = VALU_MAX;
+
+    // If it was written by an SALU, remember how many clock cycles are left
+    // until it completes.
+    uint8_t SALUCycles = 0;
+
+    DelayInfo() = default;
+
+    // Start tracking a brand-new write of the given Type whose result takes
+    // Cycles clock cycles to become available. The *Num counters start at 0
+    // and are bumped by advance() as later instructions issue.
+    DelayInfo(DelayType Type, unsigned Cycles) {
+      switch (Type) {
+      default:
+        llvm_unreachable("unexpected type");
+      case VALU:
+        VALUCycles = Cycles;
+        VALUNum = 0;
+        break;
+      case TRANS:
+        TRANSCycles = Cycles;
+        TRANSNum = 0;
+        TRANSNumVALU = 0;
+        break;
+      case SALU:
+        SALUCycles = Cycles;
+        break;
+      }
+    }
+
+    bool operator==(const DelayInfo &RHS) const {
+      return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
+             TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
+             TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
+    }
+
+    bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }
+
+    // Merge another DelayInfo into this one, to represent the union of the
+    // worst-case delays of each type: the maximum cycles remaining, and the
+    // minimum (i.e. most recent) instruction distance.
+    void merge(const DelayInfo &RHS) {
+      VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
+      VALUNum = std::min(VALUNum, RHS.VALUNum);
+      TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
+      TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
+      TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
+      SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
+    }
+
+    // Update this DelayInfo after issuing an instruction of the given Type,
+    // where Cycles is the number of cycles it takes to issue the instruction.
+    // Return true if there is no longer any useful delay info, i.e. the
+    // caller can forget about this regunit entirely.
+    bool advance(DelayType Type, unsigned Cycles) {
+      bool Erase = true;
+
+      VALUNum += (Type == VALU);
+      if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
+        // Forget about the VALU instruction. It was too far back or has
+        // definitely completed by now.
+        VALUNum = VALU_MAX;
+        VALUCycles = 0;
+      } else {
+        VALUCycles -= Cycles;
+        Erase = false;
+      }
+
+      TRANSNum += (Type == TRANS);
+      TRANSNumVALU += (Type == VALU);
+      if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
+        // Forget about any TRANS instruction. It was too far back or has
+        // definitely completed by now.
+        TRANSNum = TRANS_MAX;
+        TRANSNumVALU = VALU_MAX;
+        TRANSCycles = 0;
+      } else {
+        TRANSCycles -= Cycles;
+        Erase = false;
+      }
+
+      if (SALUCycles <= Cycles) {
+        // Forget about any SALU instruction. It has definitely completed by
+        // now.
+        SALUCycles = 0;
+      } else {
+        SALUCycles -= Cycles;
+        Erase = false;
+      }
+
+      return Erase;
+    }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    void dump() const {
+      if (VALUCycles)
+        dbgs() << " VALUCycles=" << (int)VALUCycles;
+      if (VALUNum < VALU_MAX)
+        dbgs() << " VALUNum=" << (int)VALUNum;
+      if (TRANSCycles)
+        dbgs() << " TRANSCycles=" << (int)TRANSCycles;
+      if (TRANSNum < TRANS_MAX)
+        dbgs() << " TRANSNum=" << (int)TRANSNum;
+      if (TRANSNumVALU < VALU_MAX)
+        dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
+      if (SALUCycles)
+        dbgs() << " SALUCycles=" << (int)SALUCycles;
+    }
+#endif
+  };
+
+  // A map from regunits to the delay info for that regunit.
+  struct DelayState : DenseMap<unsigned, DelayInfo> {
+    // Merge another DelayState into this one by merging the delay info for each
+    // regunit.
+    void merge(const DelayState &RHS) {
+      for (const auto &KV : RHS) {
+        iterator It;
+        bool Inserted;
+        std::tie(It, Inserted) = insert(KV);
+        if (!Inserted)
+          It->second.merge(KV.second);
+      }
+    }
+
+    // Advance the delay info for each regunit, erasing any that are no longer
+    // useful.
+    void advance(DelayType Type, unsigned Cycles) {
+      iterator Next;
+      // Next is computed before a possible erase so iteration stays valid.
+      for (auto I = begin(), E = end(); I != E; I = Next) {
+        Next = std::next(I);
+        if (I->second.advance(Type, Cycles))
+          erase(I);
+      }
+    }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    void dump(const TargetRegisterInfo *TRI) const {
+      if (empty()) {
+        dbgs() << " empty\n";
+        return;
+      }
+
+      // Dump DelayInfo for each RegUnit in numerical order.
+      SmallVector<const_iterator, 8> Order;
+      Order.reserve(size());
+      for (const_iterator I = begin(), E = end(); I != E; ++I)
+        Order.push_back(I);
+      llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
+        return A->first < B->first;
+      });
+      for (const_iterator I : Order) {
+        dbgs() << " " << printRegUnit(I->first, TRI);
+        I->second.dump();
+        dbgs() << "\n";
+      }
+    }
+#endif
+  };
+
+  // The saved delay state at the end of each basic block.
+  DenseMap<MachineBasicBlock *, DelayState> BlockState;
+
+  // Emit an s_delay_alu instruction if necessary before MI.
+  // Returns the s_delay_alu that still has a free second slot (either
+  // LastDelayAlu unchanged, or a newly emitted one), or nullptr if none.
+  MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
+                             MachineInstr *LastDelayAlu) {
+    // Immediate layout used below: bits [3:0] encode the first dependency,
+    // bits [6:4] a skip count, and bits [10:7] an optional second dependency.
+    unsigned Imm = 0;
+
+    // Wait for a TRANS instruction.
+    if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
+      Imm |= 4 + Delay.TRANSNum;
+
+    // Wait for a VALU instruction (if it's more recent than any TRANS
+    // instruction that we're also waiting for).
+    if (Delay.VALUNum < DelayInfo::VALU_MAX &&
+        Delay.VALUNum <= Delay.TRANSNumVALU) {
+      // If bits [3:0] already hold the TRANS delay, put the VALU delay in
+      // the second slot at bits [10:7].
+      if (Imm & 0xf)
+        Imm |= Delay.VALUNum << 7;
+      else
+        Imm |= Delay.VALUNum;
+    }
+
+    // Wait for an SALU instruction.
+    // SALU delays are encoded as 8 + cycles, in whichever slot is still free.
+    if (Delay.SALUCycles) {
+      if (Imm & 0x780) {
+        // We have already encoded a VALU and a TRANS delay. There's no room in
+        // the encoding for an SALU delay as well, so just drop it.
+      } else if (Imm & 0xf) {
+        Imm |= (Delay.SALUCycles + 8) << 7;
+      } else {
+        Imm |= Delay.SALUCycles + 8;
+      }
+    }
+
+    // Don't emit the s_delay_alu instruction if there's nothing to wait for.
+    if (!Imm)
+      return LastDelayAlu;
+
+    // If we only need to wait for one instruction, try encoding it in the last
+    // s_delay_alu that we emitted.
+    if (!(Imm & 0x780) && LastDelayAlu) {
+      // Count the real (non-bundle, non-meta) instructions between the last
+      // s_delay_alu and MI; this becomes the skip count.
+      unsigned Skip = 0;
+      for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
+                E = MachineBasicBlock::instr_iterator(MI);
+           ++I != E;) {
+        if (!I->isBundle() && !I->isMetaInstruction())
+          ++Skip;
+      }
+      // The skip count must fit the 3-bit skip field (only values 0-5 are
+      // usable here).
+      if (Skip < 6) {
+        MachineOperand &Op = LastDelayAlu->getOperand(0);
+        unsigned LastImm = Op.getImm();
+        assert((LastImm & ~0xf) == 0 &&
+               "Remembered an s_delay_alu with no room for another delay!");
+        LastImm |= Imm << 7 | Skip << 4;
+        Op.setImm(LastImm);
+        return nullptr;
+      }
+    }
+
+    auto &MBB = *MI.getParent();
+    MachineInstr *DelayAlu =
+        BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
+    // Remember the s_delay_alu for next time if there is still room in it to
+    // encode another delay.
+    return (Imm & 0x780) ? nullptr : DelayAlu;
+  }
+
+  // Process one basic block. When Emit is false this is an analysis-only
+  // pass that computes the block's outgoing delay state; when Emit is true
+  // it inserts the s_delay_alu instructions. Returns true iff the block's
+  // saved state changed (only possible when Emit is false).
+  bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
+    // The incoming state is the merge of all predecessors' outgoing states.
+    DelayState State;
+    for (auto *Pred : MBB.predecessors())
+      State.merge(BlockState[Pred]);
+
+    LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)
+                      << "\n";
+               State.dump(TRI););
+
+    bool Changed = false;
+    MachineInstr *LastDelayAlu = nullptr;
+
+    // Iterate over the contents of bundles, but don't emit any instructions
+    // inside a bundle.
+    for (auto &MI : MBB.instrs()) {
+      if (MI.isBundle() || MI.isMetaInstruction())
+        continue;
+
+      // Ignore some more instructions that do not generate any code.
+      switch (MI.getOpcode()) {
+      case AMDGPU::SI_RETURN_TO_EPILOG:
+        continue;
+      }
+
+      DelayType Type = getDelayType(MI.getDesc().TSFlags);
+
+      if (instructionWaitsForVALU(MI)) {
+        // Forget about all outstanding VALU delays.
+        State = DelayState();
+      } else if (Type != OTHER) {
+        // Collect (and consume) the pending delays on every regunit this
+        // instruction reads.
+        DelayInfo Delay;
+        // TODO: Scan implicit uses too?
+        for (const auto &Op : MI.explicit_uses()) {
+          if (Op.isReg()) {
+            // One of the operands of the writelane is also the output operand.
+            // This creates the insertion of redundant delays. Hence, we have to
+            // ignore this operand.
+            if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
+              continue;
+            for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) {
+              auto It = State.find(*UI);
+              if (It != State.end()) {
+                Delay.merge(It->second);
+                State.erase(*UI);
+              }
+            }
+          }
+        }
+        if (Emit && !MI.isBundledWithPred()) {
+          // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
+          // just ignore them?
+          LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
+        }
+      }
+
+      if (Type != OTHER) {
+        // Record a fresh delay on every regunit this instruction defines.
+        // TODO: Scan implicit defs too?
+        for (const auto &Op : MI.defs()) {
+          unsigned Latency = SchedModel.computeOperandLatency(
+              &MI, MI.getOperandNo(&Op), nullptr, 0);
+          for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI)
+            State[*UI] = DelayInfo(Type, Latency);
+        }
+      }
+
+      // Advance by the number of cycles it takes to issue this instruction.
+      // TODO: Use a more advanced model that accounts for instructions that
+      // take multiple cycles to issue on a particular pipeline.
+      unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
+      // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
+      // instructions on the assumption that they will usually have to be issued
+      // twice?
+      State.advance(Type, Cycles);
+
+      LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););
+    }
+
+    if (Emit) {
+      assert(State == BlockState[&MBB] &&
+             "Basic block state should not have changed on final pass!");
+    } else if (State != BlockState[&MBB]) {
+      BlockState[&MBB] = std::move(State);
+      Changed = true;
+    }
+    return Changed;
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (skipFunction(MF.getFunction()))
+      return false;
+
+    LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
+                      << "\n");
+
+    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+    if (!ST.hasDelayAlu())
+      return false;
+
+    SII = ST.getInstrInfo();
+    TRI = ST.getRegisterInfo();
+
+    SchedModel.init(&ST);
+
+    // Calculate the delay state for each basic block, iterating until we reach
+    // a fixed point.
+    SetVector<MachineBasicBlock *> WorkList;
+    for (auto &MBB : reverse(MF))
+      WorkList.insert(&MBB);
+    while (!WorkList.empty()) {
+      auto &MBB = *WorkList.pop_back_val();
+      bool Changed = runOnMachineBasicBlock(MBB, false);
+      if (Changed)
+        WorkList.insert(MBB.succ_begin(), MBB.succ_end());
+    }
+
+    LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");
+
+    // Make one last pass over all basic blocks to emit s_delay_alu
+    // instructions.
+    bool Changed = false;
+    for (auto &MBB : MF)
+      Changed |= runOnMachineBasicBlock(MBB, true);
+    return Changed;
+  }
+};
+
+} // namespace
+
+// Pass identification: the address of ID uniquely identifies the pass.
+char AMDGPUInsertDelayAlu::ID = 0;
+
+// Exported handle so the target's pass pipeline can reference this pass.
+char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;
+
+INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
+                false, false)