diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp | 457 |
1 file changed, 457 insertions, 0 deletions
//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SetVector.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-insert-delay-alu"

namespace {

class AMDGPUInsertDelayAlu : public MachineFunctionPass {
public:
  static char ID;

  // Target hooks cached in runOnMachineFunction for use by the helpers below.
  const SIInstrInfo *SII;
  const TargetRegisterInfo *TRI;

  // Used to look up operand latencies when recording how long a def's result
  // takes to become available.
  TargetSchedModel SchedModel;

  AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  // Return true if MI waits for all outstanding VALU instructions to complete.
  static bool instructionWaitsForVALU(const MachineInstr &MI) {
    // These instruction types wait for VA_VDST==0 before issuing.
    const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
                               SIInstrFlags::FLAT | SIInstrFlags::MIMG |
                               SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
    if (MI.getDesc().TSFlags & VA_VDST_0)
      return true;
    if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
        MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
      return true;
    // An s_waitcnt_depctr with a zero VA_VDST field also waits for all
    // outstanding VALU instructions. NOTE(review): 0xf000 is assumed to be the
    // VA_VDST field mask in the depctr immediate — confirm against the depctr
    // encoding for the targets this runs on.
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        (MI.getOperand(0).getImm() & 0xf000) == 0)
      return true;
    return false;
  }

  // Types of delay that can be encoded in an s_delay_alu instruction.
  enum DelayType { VALU, TRANS, SALU, OTHER };

  // Get the delay type for an instruction with the specified TSFlags.
  // Note the order: a TRANS instruction also has the VALU flag set, so TRANS
  // must be tested first.
  static DelayType getDelayType(uint64_t TSFlags) {
    if (TSFlags & SIInstrFlags::TRANS)
      return TRANS;
    if (TSFlags & SIInstrFlags::VALU)
      return VALU;
    if (TSFlags & SIInstrFlags::SALU)
      return SALU;
    return OTHER;
  }

  // Information about the last instruction(s) that wrote to a particular
  // regunit. In straight-line code there will only be one such instruction, but
  // when control flow converges we merge the delay information from each path
  // to represent the union of the worst-case delays of each type.
  struct DelayInfo {
    // One larger than the maximum number of (non-TRANS) VALU instructions we
    // can encode in an s_delay_alu instruction.
    static const unsigned VALU_MAX = 5;

    // One larger than the maximum number of TRANS instructions we can encode in
    // an s_delay_alu instruction.
    static const unsigned TRANS_MAX = 4;

    // If it was written by a (non-TRANS) VALU, remember how many clock cycles
    // are left until it completes, and how many other (non-TRANS) VALU we have
    // seen since it was issued.
    uint8_t VALUCycles = 0;
    uint8_t VALUNum = VALU_MAX;

    // If it was written by a TRANS, remember how many clock cycles are left
    // until it completes, and how many other TRANS we have seen since it was
    // issued.
    uint8_t TRANSCycles = 0;
    uint8_t TRANSNum = TRANS_MAX;
    // Also remember how many other (non-TRANS) VALU we have seen since it was
    // issued. When an instruction depends on both a prior TRANS and a prior
    // non-TRANS VALU, this is used to decide whether to encode a wait for just
    // one or both of them.
    uint8_t TRANSNumVALU = VALU_MAX;

    // If it was written by an SALU, remember how many clock cycles are left
    // until it completes.
    uint8_t SALUCycles = 0;

    DelayInfo() = default;

    // Record a single write of the given Type whose result is available after
    // Cycles clock cycles.
    DelayInfo(DelayType Type, unsigned Cycles) {
      switch (Type) {
      default:
        llvm_unreachable("unexpected type");
      case VALU:
        VALUCycles = Cycles;
        VALUNum = 0;
        break;
      case TRANS:
        TRANSCycles = Cycles;
        TRANSNum = 0;
        TRANSNumVALU = 0;
        break;
      case SALU:
        SALUCycles = Cycles;
        break;
      }
    }

    bool operator==(const DelayInfo &RHS) const {
      return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
             TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
             TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
    }

    bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }

    // Merge another DelayInfo into this one, to represent the union of the
    // worst-case delays of each type.
    void merge(const DelayInfo &RHS) {
      VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
      VALUNum = std::min(VALUNum, RHS.VALUNum);
      TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
      TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
      TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
      SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
    }

    // Update this DelayInfo after issuing an instruction of the given Type.
    // Cycles is the number of cycles it takes to issue the instruction. Return
    // true if there is no longer any useful delay info, i.e. the tracked
    // instruction(s) have all completed or fallen too far back to encode.
    bool advance(DelayType Type, unsigned Cycles) {
      bool Erase = true;

      VALUNum += (Type == VALU);
      if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
        // Forget about the VALU instruction. It was too far back or has
        // definitely completed by now.
        VALUNum = VALU_MAX;
        VALUCycles = 0;
      } else {
        VALUCycles -= Cycles;
        Erase = false;
      }

      TRANSNum += (Type == TRANS);
      TRANSNumVALU += (Type == VALU);
      if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
        // Forget about any TRANS instruction. It was too far back or has
        // definitely completed by now.
        TRANSNum = TRANS_MAX;
        TRANSNumVALU = VALU_MAX;
        TRANSCycles = 0;
      } else {
        TRANSCycles -= Cycles;
        Erase = false;
      }

      if (SALUCycles <= Cycles) {
        // Forget about any SALU instruction. It has definitely completed by
        // now.
        SALUCycles = 0;
      } else {
        SALUCycles -= Cycles;
        Erase = false;
      }

      return Erase;
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump() const {
      if (VALUCycles)
        dbgs() << " VALUCycles=" << (int)VALUCycles;
      if (VALUNum < VALU_MAX)
        dbgs() << " VALUNum=" << (int)VALUNum;
      if (TRANSCycles)
        dbgs() << " TRANSCycles=" << (int)TRANSCycles;
      if (TRANSNum < TRANS_MAX)
        dbgs() << " TRANSNum=" << (int)TRANSNum;
      if (TRANSNumVALU < VALU_MAX)
        dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
      if (SALUCycles)
        dbgs() << " SALUCycles=" << (int)SALUCycles;
    }
#endif
  };

  // A map from regunits to the delay info for that regunit.
  struct DelayState : DenseMap<unsigned, DelayInfo> {
    // Merge another DelayState into this one by merging the delay info for each
    // regunit.
    void merge(const DelayState &RHS) {
      for (const auto &KV : RHS) {
        iterator It;
        bool Inserted;
        std::tie(It, Inserted) = insert(KV);
        if (!Inserted)
          It->second.merge(KV.second);
      }
    }

    // Advance the delay info for each regunit, erasing any that are no longer
    // useful.
    void advance(DelayType Type, unsigned Cycles) {
      iterator Next;
      for (auto I = begin(), E = end(); I != E; I = Next) {
        // Grab the successor first: erase(I) invalidates I.
        Next = std::next(I);
        if (I->second.advance(Type, Cycles))
          erase(I);
      }
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(const TargetRegisterInfo *TRI) const {
      if (empty()) {
        dbgs() << " empty\n";
        return;
      }

      // Dump DelayInfo for each RegUnit in numerical order.
      SmallVector<const_iterator, 8> Order;
      Order.reserve(size());
      for (const_iterator I = begin(), E = end(); I != E; ++I)
        Order.push_back(I);
      llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
        return A->first < B->first;
      });
      for (const_iterator I : Order) {
        dbgs() << " " << printRegUnit(I->first, TRI);
        I->second.dump();
        dbgs() << "\n";
      }
    }
#endif
  };

  // The saved delay state at the end of each basic block.
  DenseMap<MachineBasicBlock *, DelayState> BlockState;

  // Emit an s_delay_alu instruction if necessary before MI. Delay describes
  // the outstanding writes MI depends on; LastDelayAlu is the most recently
  // emitted s_delay_alu that still has a free second slot, or null. Returns
  // the s_delay_alu to remember for next time (or null if there is no room
  // left in it).
  //
  // NOTE(review): the masks/shifts below assume the s_delay_alu immediate
  // layout instid0 in bits [3:0], skip in bits [6:4], instid1 in bits [10:7]
  // — confirm against the ISA reference. 0x780 tests instid1, 0xf tests
  // instid0.
  MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
                             MachineInstr *LastDelayAlu) {
    unsigned Imm = 0;

    // Wait for a TRANS instruction.
    if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
      Imm |= 4 + Delay.TRANSNum;

    // Wait for a VALU instruction (if it's more recent than any TRANS
    // instruction that we're also waiting for).
    if (Delay.VALUNum < DelayInfo::VALU_MAX &&
        Delay.VALUNum <= Delay.TRANSNumVALU) {
      if (Imm & 0xf)
        Imm |= Delay.VALUNum << 7;
      else
        Imm |= Delay.VALUNum;
    }

    // Wait for an SALU instruction.
    if (Delay.SALUCycles) {
      if (Imm & 0x780) {
        // We have already encoded a VALU and a TRANS delay. There's no room in
        // the encoding for an SALU delay as well, so just drop it.
      } else if (Imm & 0xf) {
        Imm |= (Delay.SALUCycles + 8) << 7;
      } else {
        Imm |= Delay.SALUCycles + 8;
      }
    }

    // Don't emit the s_delay_alu instruction if there's nothing to wait for.
    if (!Imm)
      return LastDelayAlu;

    // If we only need to wait for one instruction, try encoding it in the last
    // s_delay_alu that we emitted.
    if (!(Imm & 0x780) && LastDelayAlu) {
      // Count the real (non-bundle-header, non-meta) instructions between the
      // last s_delay_alu and MI; the skip field can only express up to 5.
      unsigned Skip = 0;
      for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
                E = MachineBasicBlock::instr_iterator(MI);
           ++I != E;) {
        if (!I->isBundle() && !I->isMetaInstruction())
          ++Skip;
      }
      if (Skip < 6) {
        MachineOperand &Op = LastDelayAlu->getOperand(0);
        unsigned LastImm = Op.getImm();
        assert((LastImm & ~0xf) == 0 &&
               "Remembered an s_delay_alu with no room for another delay!");
        LastImm |= Imm << 7 | Skip << 4;
        Op.setImm(LastImm);
        return nullptr;
      }
    }

    auto &MBB = *MI.getParent();
    MachineInstr *DelayAlu =
        BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
    // Remember the s_delay_alu for next time if there is still room in it to
    // encode another delay.
    return (Imm & 0x780) ? nullptr : DelayAlu;
  }

  // Simulate one pass over MBB, starting from the merged end-of-block states
  // of its predecessors. If Emit is false, only (re)compute the cached
  // end-of-block state and report whether it changed; if Emit is true, also
  // insert s_delay_alu instructions (the state must already be at a fixed
  // point).
  bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
    DelayState State;
    for (auto *Pred : MBB.predecessors())
      State.merge(BlockState[Pred]);

    LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)
                      << "\n";
               State.dump(TRI););

    bool Changed = false;
    MachineInstr *LastDelayAlu = nullptr;

    // Iterate over the contents of bundles, but don't emit any instructions
    // inside a bundle.
    for (auto &MI : MBB.instrs()) {
      if (MI.isBundle() || MI.isMetaInstruction())
        continue;

      // Ignore some more instructions that do not generate any code.
      switch (MI.getOpcode()) {
      case AMDGPU::SI_RETURN_TO_EPILOG:
        continue;
      }

      DelayType Type = getDelayType(MI.getDesc().TSFlags);

      if (instructionWaitsForVALU(MI)) {
        // Forget about all outstanding VALU delays.
        // NOTE(review): this also clears any SALU delay info, on the
        // assumption that is conservative/acceptable here.
        State = DelayState();
      } else if (Type != OTHER) {
        DelayInfo Delay;
        // TODO: Scan implicit uses too?
        for (const auto &Op : MI.explicit_uses()) {
          if (Op.isReg()) {
            // One of the operands of the writelane is also the output operand.
            // This creates the insertion of redundant delays. Hence, we have to
            // ignore this operand.
            if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
              continue;
            for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) {
              auto It = State.find(*UI);
              if (It != State.end()) {
                // The dependency is about to be satisfied by the delay we
                // emit, so drop it from the state.
                Delay.merge(It->second);
                State.erase(*UI);
              }
            }
          }
        }
        if (Emit && !MI.isBundledWithPred()) {
          // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
          // just ignore them?
          LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
        }
      }

      if (Type != OTHER) {
        // Record this instruction's writes so later readers know what to wait
        // for.
        // TODO: Scan implicit defs too?
        for (const auto &Op : MI.defs()) {
          unsigned Latency = SchedModel.computeOperandLatency(
              &MI, MI.getOperandNo(&Op), nullptr, 0);
          for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI)
            State[*UI] = DelayInfo(Type, Latency);
        }
      }

      // Advance by the number of cycles it takes to issue this instruction.
      // TODO: Use a more advanced model that accounts for instructions that
      // take multiple cycles to issue on a particular pipeline.
      unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
      // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
      // instructions on the assumption that they will usually have to be issued
      // twice?
      State.advance(Type, Cycles);

      LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););
    }

    if (Emit) {
      assert(State == BlockState[&MBB] &&
             "Basic block state should not have changed on final pass!");
    } else if (State != BlockState[&MBB]) {
      BlockState[&MBB] = std::move(State);
      Changed = true;
    }
    return Changed;
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    if (skipFunction(MF.getFunction()))
      return false;

    LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
                      << "\n");

    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasDelayAlu())
      return false;

    SII = ST.getInstrInfo();
    TRI = ST.getRegisterInfo();

    SchedModel.init(&ST);

    // Calculate the delay state for each basic block, iterating until we reach
    // a fixed point. Seeding the worklist in reverse order means blocks tend
    // to be processed after their predecessors (it is popped from the back).
    SetVector<MachineBasicBlock *> WorkList;
    for (auto &MBB : reverse(MF))
      WorkList.insert(&MBB);
    while (!WorkList.empty()) {
      auto &MBB = *WorkList.pop_back_val();
      bool Changed = runOnMachineBasicBlock(MBB, false);
      if (Changed)
        WorkList.insert(MBB.succ_begin(), MBB.succ_end());
    }

    LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");

    // Make one last pass over all basic blocks to emit s_delay_alu
    // instructions.
    bool Changed = false;
    for (auto &MBB : MF)
      Changed |= runOnMachineBasicBlock(MBB, true);
    return Changed;
  }
};

} // namespace

char AMDGPUInsertDelayAlu::ID = 0;

char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;

INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
                false, false)