diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 496 |
1 files changed, 259 insertions, 237 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ef662d55cb0a9..2a157eb20ab47 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -32,6 +32,7 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" @@ -57,7 +58,6 @@ #include <cstring> #include <memory> #include <utility> -#include <vector> using namespace llvm; @@ -109,15 +109,13 @@ iterator_range<enum_iterator<InstCounterType>> inst_counter_types() { enum_iterator<InstCounterType>(NUM_INST_CNTS)); } -using RegInterval = std::pair<signed, signed>; +using RegInterval = std::pair<int, int>; struct { - uint32_t VmcntMax; - uint32_t ExpcntMax; - uint32_t LgkmcntMax; - uint32_t VscntMax; - int32_t NumVGPRsMax; - int32_t NumSGPRsMax; + unsigned VmcntMax; + unsigned ExpcntMax; + unsigned LgkmcntMax; + unsigned VscntMax; } HardwareLimits; struct { @@ -143,7 +141,7 @@ enum WaitEventType { NUM_WAIT_EVENTS, }; -static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = { +static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = { (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS), (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) | (1 << SQ_MESSAGE), @@ -166,6 +164,28 @@ enum RegisterMapping { NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts. }; +// Enumerate different types of result-returning VMEM operations. Although +// s_waitcnt orders them all with a single vmcnt counter, in the absence of +// s_waitcnt only instructions of the same VmemType are guaranteed to write +// their results in order -- so there is no need to insert an s_waitcnt between +// two instructions of the same type that write the same vgpr. +enum VmemType { + // BUF instructions and MIMG instructions without a sampler. + VMEM_NOSAMPLER, + // MIMG instructions with a sampler. + VMEM_SAMPLER, +}; + +VmemType getVmemType(const MachineInstr &Inst) { + assert(SIInstrInfo::isVMEM(Inst)); + if (!SIInstrInfo::isMIMG(Inst)) + return VMEM_NOSAMPLER; + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode()); + return AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler + ? VMEM_SAMPLER + : VMEM_NOSAMPLER; +} + void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { switch (T) { case VM_CNT: @@ -195,12 +215,9 @@ void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { // "s_waitcnt 0" before use. class WaitcntBrackets { public: - WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) { - for (auto T : inst_counter_types()) - memset(VgprScores[T], 0, sizeof(VgprScores[T])); - } + WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {} - static uint32_t getWaitCountMax(InstCounterType T) { + static unsigned getWaitCountMax(InstCounterType T) { switch (T) { case VM_CNT: return HardwareLimits.VmcntMax; @@ -216,17 +233,13 @@ public: return 0; } - uint32_t getScoreLB(InstCounterType T) const { + unsigned getScoreLB(InstCounterType T) const { assert(T < NUM_INST_CNTS); - if (T >= NUM_INST_CNTS) - return 0; return ScoreLBs[T]; } - uint32_t getScoreUB(InstCounterType T) const { + unsigned getScoreUB(InstCounterType T) const { assert(T < NUM_INST_CNTS); - if (T >= NUM_INST_CNTS) - return 0; return ScoreUBs[T]; } @@ -242,7 +255,7 @@ public: return EXP_CNT; } - uint32_t getRegScore(int GprNo, InstCounterType T) { + unsigned getRegScore(int GprNo, InstCounterType T) { if (GprNo < NUM_ALL_VGPRS) { return VgprScores[T][GprNo]; } @@ -250,30 +263,16 @@ public: return SgprScores[GprNo - NUM_ALL_VGPRS]; } - void clear() { - memset(ScoreLBs, 0, sizeof(ScoreLBs)); - memset(ScoreUBs, 0, sizeof(ScoreUBs)); - PendingEvents = 0; - memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents)); - for (auto T : inst_counter_types()) - memset(VgprScores[T], 0, sizeof(VgprScores[T])); - memset(SgprScores, 0, sizeof(SgprScores)); - } - bool merge(const WaitcntBrackets &Other); RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII, const MachineRegisterInfo *MRI, - const SIRegisterInfo *TRI, unsigned OpNo, - bool Def) const; - - int32_t getMaxVGPR() const { return VgprUB; } - int32_t getMaxSGPR() const { return SgprUB; } + const SIRegisterInfo *TRI, unsigned OpNo) const; bool counterOutOfOrder(InstCounterType T) const; bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const; - void determineWait(InstCounterType T, uint32_t ScoreToWait, + void determineWait(InstCounterType T, unsigned ScoreToWait, AMDGPU::Waitcnt &Wait) const; void applyWaitcnt(const AMDGPU::Waitcnt &Wait); void applyWaitcnt(InstCounterType T, unsigned Count); @@ -286,6 +285,12 @@ public: return PendingEvents & (1 << E); } + bool hasMixedPendingEvents(InstCounterType T) const { + unsigned Events = PendingEvents & WaitEventMaskForInst[T]; + // Return true if more than one bit is set in Events. + return Events & (Events - 1); + } + bool hasPendingFlat() const { return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] && LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) || @@ -298,71 +303,77 @@ public: LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT]; } + // Return true if there might be pending writes to the specified vgpr by VMEM + // instructions with types different from V. + bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const { + assert(GprNo < NUM_ALL_VGPRS); + return VgprVmemTypes[GprNo] & ~(1 << V); + } + + void clearVgprVmemTypes(int GprNo) { + assert(GprNo < NUM_ALL_VGPRS); + VgprVmemTypes[GprNo] = 0; + } + void print(raw_ostream &); void dump() { print(dbgs()); } private: struct MergeInfo { - uint32_t OldLB; - uint32_t OtherLB; - uint32_t MyShift; - uint32_t OtherShift; + unsigned OldLB; + unsigned OtherLB; + unsigned MyShift; + unsigned OtherShift; }; - static bool mergeScore(const MergeInfo &M, uint32_t &Score, - uint32_t OtherScore); + static bool mergeScore(const MergeInfo &M, unsigned &Score, + unsigned OtherScore); - void setScoreLB(InstCounterType T, uint32_t Val) { + void setScoreLB(InstCounterType T, unsigned Val) { assert(T < NUM_INST_CNTS); - if (T >= NUM_INST_CNTS) - return; ScoreLBs[T] = Val; } - void setScoreUB(InstCounterType T, uint32_t Val) { + void setScoreUB(InstCounterType T, unsigned Val) { assert(T < NUM_INST_CNTS); - if (T >= NUM_INST_CNTS) - return; ScoreUBs[T] = Val; if (T == EXP_CNT) { - uint32_t UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT); + unsigned UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT); if (ScoreLBs[T] < UB && UB < ScoreUBs[T]) ScoreLBs[T] = UB; } } - void setRegScore(int GprNo, InstCounterType T, uint32_t Val) { + void setRegScore(int GprNo, InstCounterType T, unsigned Val) { if (GprNo < NUM_ALL_VGPRS) { - if (GprNo > VgprUB) { - VgprUB = GprNo; - } + VgprUB = std::max(VgprUB, GprNo); VgprScores[T][GprNo] = Val; } else { assert(T == LGKM_CNT); - if (GprNo - NUM_ALL_VGPRS > SgprUB) { - SgprUB = GprNo - NUM_ALL_VGPRS; - } + SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS); SgprScores[GprNo - NUM_ALL_VGPRS] = Val; } } void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, - unsigned OpNo, uint32_t Val); + unsigned OpNo, unsigned Val); const GCNSubtarget *ST = nullptr; - uint32_t ScoreLBs[NUM_INST_CNTS] = {0}; - uint32_t ScoreUBs[NUM_INST_CNTS] = {0}; - uint32_t PendingEvents = 0; - bool MixedPendingEvents[NUM_INST_CNTS] = {false}; + unsigned ScoreLBs[NUM_INST_CNTS] = {0}; + unsigned ScoreUBs[NUM_INST_CNTS] = {0}; + unsigned PendingEvents = 0; // Remember the last flat memory operation. - uint32_t LastFlat[NUM_INST_CNTS] = {0}; + unsigned LastFlat[NUM_INST_CNTS] = {0}; // wait_cnt scores for every vgpr. // Keep track of the VgprUB and SgprUB to make merge at join efficient. - int32_t VgprUB = 0; - int32_t SgprUB = 0; - uint32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS]; + int VgprUB = -1; + int SgprUB = -1; + unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; // Wait cnt scores for every sgpr, only lgkmcnt is relevant. - uint32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0}; + unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0}; + // Bitmask of the VmemTypes of VMEM instructions that might have a pending + // write to each vgpr. + unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; }; class SIInsertWaitcnts : public MachineFunctionPass { @@ -385,8 +396,7 @@ private: explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {} }; - std::vector<BlockInfo> BlockInfos; // by reverse post-order traversal index - DenseMap<MachineBasicBlock *, unsigned> RpotIdxMap; + MapVector<MachineBasicBlock *, BlockInfo> BlockInfos; // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0 // because of amdgpu-waitcnt-forcezero flag @@ -464,10 +474,10 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII, const MachineRegisterInfo *MRI, const SIRegisterInfo *TRI, - unsigned OpNo, bool Def) const { + unsigned OpNo) const { const MachineOperand &Op = MI->getOperand(OpNo); - if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) || - (Def && !Op.isDef()) || TRI->isAGPR(*MRI, Op.getReg())) + assert(Op.isReg()); + if (!TRI->isInAllocatableClass(Op.getReg()) || TRI->isAGPR(*MRI, Op.getReg())) return {-1, -1}; // A use via a PW operand does not need a waitcnt. @@ -475,29 +485,27 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, assert(!Op.getSubReg() || !Op.isUndef()); RegInterval Result; - const MachineRegisterInfo &MRIA = *MRI; unsigned Reg = TRI->getEncodingValue(Op.getReg()); - if (TRI->isVGPR(MRIA, Op.getReg())) { + if (TRI->isVGPR(*MRI, Op.getReg())) { assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL); Result.first = Reg - RegisterEncoding.VGPR0; assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); - } else if (TRI->isSGPRReg(MRIA, Op.getReg())) { + } else if (TRI->isSGPRReg(*MRI, Op.getReg())) { assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS); Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS; assert(Result.first >= NUM_ALL_VGPRS && Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS); } // TODO: Handle TTMP - // else if (TRI->isTTMP(MRIA, Reg.getReg())) ... + // else if (TRI->isTTMP(*MRI, Reg.getReg())) ... else return {-1, -1}; - const MachineInstr &MIA = *MI; - const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo); + const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo); unsigned Size = TRI->getRegSizeInBits(*RC); - Result.second = Result.first + (Size / 32); + Result.second = Result.first + ((Size + 16) / 32); return Result; } @@ -506,13 +514,10 @@ void WaitcntBrackets::setExpScore(const MachineInstr *MI, const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, unsigned OpNo, - uint32_t Val) { - RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false); - LLVM_DEBUG({ - const MachineOperand &Opnd = MI->getOperand(OpNo); - assert(TRI->isVGPR(*MRI, Opnd.getReg())); - }); - for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + unsigned Val) { + RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo); + assert(TRI->isVGPR(*MRI, MI->getOperand(OpNo).getReg())); + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { setRegScore(RegNo, EXP_CNT, Val); } } @@ -521,19 +526,14 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, WaitEventType E, MachineInstr &Inst) { - const MachineRegisterInfo &MRIA = *MRI; InstCounterType T = eventCounter(E); - uint32_t CurrScore = getScoreUB(T) + 1; + unsigned CurrScore = getScoreUB(T) + 1; if (CurrScore == 0) report_fatal_error("InsertWaitcnt score wraparound"); // PendingEvents and ScoreUB need to be update regardless if this event // changes the score of a register or not. // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message. - if (!hasPendingEvent(E)) { - if (PendingEvents & WaitEventMaskForInst[T]) - MixedPendingEvents[T] = true; - PendingEvents |= 1 << E; - } + PendingEvents |= 1 << E; setScoreUB(T, CurrScore); if (T == EXP_CNT) { @@ -574,7 +574,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) { for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { const MachineOperand &Op = Inst.getOperand(I); - if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) { + if (Op.isReg() && !Op.isDef() && TRI->isVGPR(*MRI, Op.getReg())) { setExpScore(&Inst, TII, TRI, MRI, I, CurrScore); } } @@ -622,7 +622,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { MachineOperand &DefMO = Inst.getOperand(I); if (DefMO.isReg() && DefMO.isDef() && - TRI->isVGPR(MRIA, DefMO.getReg())) { + TRI->isVGPR(*MRI, DefMO.getReg())) { setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT, CurrScore); } @@ -630,7 +630,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, } for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { MachineOperand &MO = Inst.getOperand(I); - if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) { + if (MO.isReg() && !MO.isDef() && TRI->isVGPR(*MRI, MO.getReg())) { setExpScore(&Inst, TII, TRI, MRI, I, CurrScore); } } @@ -641,8 +641,8 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) { MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data); unsigned OpNo;//TODO: find the OpNo for this operand; - RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false); - for (signed RegNo = Interval.first; RegNo < Interval.second; + RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo); + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore); } @@ -650,10 +650,20 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, } else { // Match the score to the destination registers. for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { - RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true); - if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS) + auto &Op = Inst.getOperand(I); + if (!Op.isReg() || !Op.isDef()) continue; - for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I); + if (T == VM_CNT) { + if (Interval.first >= NUM_ALL_VGPRS) + continue; + if (SIInstrInfo::isVMEM(Inst)) { + VmemType V = getVmemType(Inst); + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) + VgprVmemTypes[RegNo] |= 1 << V; + } + } + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { setRegScore(RegNo, T, CurrScore); } } @@ -666,8 +676,8 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, void WaitcntBrackets::print(raw_ostream &OS) { OS << '\n'; for (auto T : inst_counter_types()) { - uint32_t LB = getScoreLB(T); - uint32_t UB = getScoreUB(T); + unsigned LB = getScoreLB(T); + unsigned UB = getScoreUB(T); switch (T) { case VM_CNT: @@ -689,11 +699,11 @@ void WaitcntBrackets::print(raw_ostream &OS) { if (LB < UB) { // Print vgpr scores. - for (int J = 0; J <= getMaxVGPR(); J++) { - uint32_t RegScore = getRegScore(J, T); + for (int J = 0; J <= VgprUB; J++) { + unsigned RegScore = getRegScore(J, T); if (RegScore <= LB) continue; - uint32_t RelScore = RegScore - LB - 1; + unsigned RelScore = RegScore - LB - 1; if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) { OS << RelScore << ":v" << J << " "; } else { @@ -702,11 +712,11 @@ void WaitcntBrackets::print(raw_ostream &OS) { } // Also need to print sgpr scores for lgkm_cnt. if (T == LGKM_CNT) { - for (int J = 0; J <= getMaxSGPR(); J++) { - uint32_t RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); + for (int J = 0; J <= SgprUB; J++) { + unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); if (RegScore <= LB) continue; - uint32_t RelScore = RegScore - LB - 1; + unsigned RelScore = RegScore - LB - 1; OS << RelScore << ":s" << J << " "; } } @@ -727,8 +737,8 @@ bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T, unsigned &Count) const { - const uint32_t LB = getScoreLB(T); - const uint32_t UB = getScoreUB(T); + const unsigned LB = getScoreLB(T); + const unsigned UB = getScoreUB(T); if (Count < UB && UB - Count > LB) return true; @@ -736,12 +746,12 @@ bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T, return false; } -void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait, +void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait, AMDGPU::Waitcnt &Wait) const { // If the score of src_operand falls within the bracket, we need an // s_waitcnt instruction. - const uint32_t LB = getScoreLB(T); - const uint32_t UB = getScoreUB(T); + const unsigned LB = getScoreLB(T); + const unsigned UB = getScoreUB(T); if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { if ((T == VM_CNT || T == LGKM_CNT) && hasPendingFlat() && @@ -758,7 +768,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait, } else { // If a counter has been maxed out avoid overflow by waiting for // MAX(CounterType) - 1 instead. - uint32_t NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1); + unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1); addWait(Wait, T, NeededWait); } } @@ -772,7 +782,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { } void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { - const uint32_t UB = getScoreUB(T); + const unsigned UB = getScoreUB(T); if (Count >= UB) return; if (Count != 0) { @@ -781,7 +791,6 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { setScoreLB(T, std::max(getScoreLB(T), UB - Count)); } else { setScoreLB(T, UB); - MixedPendingEvents[T] = false; PendingEvents &= ~WaitEventMaskForInst[T]; } } @@ -792,7 +801,7 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { // Scalar memory read always can go out of order. if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS)) return true; - return MixedPendingEvents[T]; + return hasMixedPendingEvents(T); } INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, @@ -954,10 +963,10 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( int CallAddrOpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); - RegInterval CallAddrOpInterval = ScoreBrackets.getRegInterval( - &MI, TII, MRI, TRI, CallAddrOpIdx, false); + RegInterval CallAddrOpInterval = + ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx); - for (signed RegNo = CallAddrOpInterval.first; + for (int RegNo = CallAddrOpInterval.first; RegNo < CallAddrOpInterval.second; ++RegNo) ScoreBrackets.determineWait( LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); @@ -965,10 +974,10 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( int RtnAddrOpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); if (RtnAddrOpIdx != -1) { - RegInterval RtnAddrOpInterval = ScoreBrackets.getRegInterval( - &MI, TII, MRI, TRI, RtnAddrOpIdx, false); + RegInterval RtnAddrOpInterval = + ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx); - for (signed RegNo = RtnAddrOpInterval.first; + for (int RegNo = RtnAddrOpInterval.first; RegNo < RtnAddrOpInterval.second; ++RegNo) ScoreBrackets.determineWait( LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); @@ -982,7 +991,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( // emitted. // If the source operand was defined by a load, add the s_waitcnt // instruction. + // + // Two cases are handled for destination operands: + // 1) If the destination operand was defined by a load, add the s_waitcnt + // instruction to guarantee the right WAW order. + // 2) If a destination operand that was used by a recent export/store ins, + // add s_waitcnt on exp_cnt to guarantee the WAR order. for (const MachineMemOperand *Memop : MI.memoperands()) { + const Value *Ptr = Memop->getValue(); + if (Memop->isStore() && SLoadAddresses.count(Ptr)) { + addWait(Wait, LGKM_CNT, 0); + if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second)) + SLoadAddresses.erase(Ptr); + } unsigned AS = Memop->getAddrSpace(); if (AS != AMDGPUAS::LOCAL_ADDRESS) continue; @@ -990,67 +1011,41 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( // VM_CNT is only relevant to vgpr or LDS. ScoreBrackets.determineWait( VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); - } - - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { - const MachineOperand &Op = MI.getOperand(I); - const MachineRegisterInfo &MRIA = *MRI; - RegInterval Interval = - ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false); - for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - if (TRI->isVGPR(MRIA, Op.getReg())) { - // VM_CNT is only relevant to vgpr or LDS. - ScoreBrackets.determineWait( - VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); - } - ScoreBrackets.determineWait( - LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); - } - } - // End of for loop that looks at all source operands to decide vm_wait_cnt - // and lgk_wait_cnt. - - // Two cases are handled for destination operands: - // 1) If the destination operand was defined by a load, add the s_waitcnt - // instruction to guarantee the right WAW order. - // 2) If a destination operand that was used by a recent export/store ins, - // add s_waitcnt on exp_cnt to guarantee the WAR order. - if (MI.mayStore()) { - // FIXME: Should not be relying on memoperands. - for (const MachineMemOperand *Memop : MI.memoperands()) { - const Value *Ptr = Memop->getValue(); - if (SLoadAddresses.count(Ptr)) { - addWait(Wait, LGKM_CNT, 0); - if (PDT->dominates(MI.getParent(), - SLoadAddresses.find(Ptr)->second)) - SLoadAddresses.erase(Ptr); - } - unsigned AS = Memop->getAddrSpace(); - if (AS != AMDGPUAS::LOCAL_ADDRESS) - continue; - unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; - ScoreBrackets.determineWait( - VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); + if (Memop->isStore()) { ScoreBrackets.determineWait( EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); } } + + // Loop over use and def operands. for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { - MachineOperand &Def = MI.getOperand(I); - const MachineRegisterInfo &MRIA = *MRI; + MachineOperand &Op = MI.getOperand(I); + if (!Op.isReg()) + continue; RegInterval Interval = - ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true); - for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - if (TRI->isVGPR(MRIA, Def.getReg())) { - ScoreBrackets.determineWait( - VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); - ScoreBrackets.determineWait( - EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); + ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I); + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + if (TRI->isVGPR(*MRI, Op.getReg())) { + // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the + // previous write and this write are the same type of VMEM + // instruction, in which case they're guaranteed to write their + // results in order anyway. + if (Op.isUse() || !SIInstrInfo::isVMEM(MI) || + ScoreBrackets.hasOtherPendingVmemTypes(RegNo, + getVmemType(MI))) { + ScoreBrackets.determineWait( + VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); + ScoreBrackets.clearVgprVmemTypes(RegNo); + } + if (Op.isDef()) { + ScoreBrackets.determineWait( + EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); + } } ScoreBrackets.determineWait( LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); } - } // End of for loop that looks at all dest operands. + } } } @@ -1154,7 +1149,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( } LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" - << "Old Instr: " << MI << '\n' + << "Old Instr: " << MI << "New Instr: " << *II << '\n'); if (!Wait.hasWait()) @@ -1171,7 +1166,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( Modified = true; LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" - << "Old Instr: " << MI << '\n' + << "Old Instr: " << MI << "New Instr: " << *SWaitInst << '\n'); } @@ -1187,7 +1182,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore( Modified = true; LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" - << "Old Instr: " << MI << '\n' + << "Old Instr: " << MI << "New Instr: " << *SWaitInst << '\n'); } @@ -1303,10 +1298,10 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, } } -bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score, - uint32_t OtherScore) { - uint32_t MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift; - uint32_t OtherShifted = +bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score, + unsigned OtherScore) { + unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift; + unsigned OtherShifted = OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift; Score = std::max(MyShifted, OtherShifted); return OtherShifted > MyShifted; @@ -1320,44 +1315,50 @@ bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score, bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { bool StrictDom = false; + VgprUB = std::max(VgprUB, Other.VgprUB); + SgprUB = std::max(SgprUB, Other.SgprUB); + for (auto T : inst_counter_types()) { // Merge event flags for this counter const bool OldOutOfOrder = counterOutOfOrder(T); - const uint32_t OldEvents = PendingEvents & WaitEventMaskForInst[T]; - const uint32_t OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; + const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T]; + const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; if (OtherEvents & ~OldEvents) StrictDom = true; - if (Other.MixedPendingEvents[T] || - (OldEvents && OtherEvents && OldEvents != OtherEvents)) - MixedPendingEvents[T] = true; PendingEvents |= OtherEvents; // Merge scores for this counter - const uint32_t MyPending = ScoreUBs[T] - ScoreLBs[T]; - const uint32_t OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T]; + const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T]; + const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T]; + const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending); + if (NewUB < ScoreLBs[T]) + report_fatal_error("waitcnt score overflow"); + MergeInfo M; M.OldLB = ScoreLBs[T]; M.OtherLB = Other.ScoreLBs[T]; - M.MyShift = OtherPending > MyPending ? OtherPending - MyPending : 0; - M.OtherShift = ScoreUBs[T] - Other.ScoreUBs[T] + M.MyShift; + M.MyShift = NewUB - ScoreUBs[T]; + M.OtherShift = NewUB - Other.ScoreUBs[T]; - const uint32_t NewUB = ScoreUBs[T] + M.MyShift; - if (NewUB < ScoreUBs[T]) - report_fatal_error("waitcnt score overflow"); ScoreUBs[T] = NewUB; - ScoreLBs[T] = std::min(M.OldLB + M.MyShift, M.OtherLB + M.OtherShift); StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]); bool RegStrictDom = false; - for (int J = 0, E = std::max(getMaxVGPR(), Other.getMaxVGPR()) + 1; J != E; - J++) { + for (int J = 0; J <= VgprUB; J++) { RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]); } + if (T == VM_CNT) { + for (int J = 0; J <= VgprUB; J++) { + unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J]; + RegStrictDom |= NewVmemTypes != VgprVmemTypes[J]; + VgprVmemTypes[J] = NewVmemTypes; + } + } + if (T == LGKM_CNT) { - for (int J = 0, E = std::max(getMaxSGPR(), Other.getMaxSGPR()) + 1; - J != E; J++) { + for (int J = 0; J <= SgprUB; J++) { RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]); } } @@ -1366,9 +1367,6 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { StrictDom = true; } - VgprUB = std::max(getMaxVGPR(), Other.getMaxVGPR()); - SgprUB = std::max(getMaxSGPR(), Other.getMaxSGPR()); - return StrictDom; } @@ -1383,6 +1381,10 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, ScoreBrackets.dump(); }); + // Assume VCCZ is correct at basic block boundaries, unless and until we need + // to handle cases where that is not true. + bool VCCZCorrect = true; + // Walk over the instructions. MachineInstr *OldWaitcntInstr = nullptr; @@ -1402,13 +1404,26 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, continue; } - bool VCCZBugWorkAround = false; + // We might need to restore vccz to its correct value for either of two + // different reasons; see ST->hasReadVCCZBug() and + // ST->partialVCCWritesUpdateVCCZ(). + bool RestoreVCCZ = false; if (readsVCCZ(Inst)) { - if (ScoreBrackets.getScoreLB(LGKM_CNT) < - ScoreBrackets.getScoreUB(LGKM_CNT) && - ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { - if (ST->hasReadVCCZBug()) - VCCZBugWorkAround = true; + if (!VCCZCorrect) + RestoreVCCZ = true; + else if (ST->hasReadVCCZBug()) { + // There is a hardware bug on CI/SI where SMRD instruction may corrupt + // vccz bit, so when we detect that an instruction may read from a + // corrupt vccz bit, we need to: + // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD + // operations to complete. + // 2. Restore the correct value of vccz by writing the current value + // of vcc back to vcc. + if (ScoreBrackets.getScoreLB(LGKM_CNT) < + ScoreBrackets.getScoreUB(LGKM_CNT) && + ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { + RestoreVCCZ = true; + } } } @@ -1419,6 +1434,16 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } } + if (!ST->partialVCCWritesUpdateVCCZ()) { + // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz. + // Writes to vcc will fix it. + if (Inst.definesRegister(AMDGPU::VCC_LO) || + Inst.definesRegister(AMDGPU::VCC_HI)) + VCCZCorrect = false; + else if (Inst.definesRegister(AMDGPU::VCC)) + VCCZCorrect = true; + } + // Generate an s_waitcnt instruction to be placed before // cur_Inst, if needed. Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr); @@ -1444,7 +1469,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, // TODO: Remove this work-around after fixing the scheduler and enable the // assert above. - if (VCCZBugWorkAround) { + if (RestoreVCCZ) { // Restore the vccz bit. Any time a value is written to vcc, the vcc // bit is updated, so we can restore the bit by reading the value of // vcc and then writing it back to the register. @@ -1452,6 +1477,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), TRI->getVCC()) .addReg(TRI->getVCC()); + VCCZCorrect = true; Modified = true; } @@ -1479,29 +1505,23 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV); HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0; - HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs(); - HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs(); - assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS); - assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS); + unsigned NumVGPRsMax = ST->getAddressableNumVGPRs(); + unsigned NumSGPRsMax = ST->getAddressableNumSGPRs(); + assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS); + assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS); RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0); - RegisterEncoding.VGPRL = - RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1; + RegisterEncoding.VGPRL = RegisterEncoding.VGPR0 + NumVGPRsMax - 1; RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0); - RegisterEncoding.SGPRL = - RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1; + RegisterEncoding.SGPRL = RegisterEncoding.SGPR0 + NumSGPRsMax - 1; TrackedWaitcntSet.clear(); - RpotIdxMap.clear(); BlockInfos.clear(); // Keep iterating over the blocks in reverse post order, inserting and // updating s_waitcnt where needed, until a fix point is reached. - for (MachineBasicBlock *MBB : - ReversePostOrderTraversal<MachineFunction *>(&MF)) { - RpotIdxMap[MBB] = BlockInfos.size(); - BlockInfos.emplace_back(MBB); - } + for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF)) + BlockInfos.insert({MBB, BlockInfo(MBB)}); std::unique_ptr<WaitcntBrackets> Brackets; bool Modified = false; @@ -1509,12 +1529,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { do { Repeat = false; - for (BlockInfo &BI : BlockInfos) { + for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE; + ++BII) { + BlockInfo &BI = BII->second; if (!BI.Dirty) continue; - unsigned Idx = std::distance(&*BlockInfos.begin(), &BI); - if (BI.Incoming) { if (!Brackets) Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming); @@ -1524,7 +1544,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { if (!Brackets) Brackets = std::make_unique<WaitcntBrackets>(ST); else - Brackets->clear(); + *Brackets = WaitcntBrackets(ST); } Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets); @@ -1533,11 +1553,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { if (Brackets->hasPending()) { BlockInfo *MoveBracketsToSucc = nullptr; for (MachineBasicBlock *Succ : BI.MBB->successors()) { - unsigned SuccIdx = RpotIdxMap[Succ]; - BlockInfo &SuccBI = BlockInfos[SuccIdx]; + auto SuccBII = BlockInfos.find(Succ); + BlockInfo &SuccBI = SuccBII->second; if (!SuccBI.Incoming) { SuccBI.Dirty = true; - if (SuccIdx <= Idx) + if (SuccBII <= BII) Repeat = true; if (!MoveBracketsToSucc) { MoveBracketsToSucc = &SuccBI; @@ -1546,7 +1566,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { } } else if (SuccBI.Incoming->merge(*Brackets)) { SuccBI.Dirty = true; - if (SuccIdx <= Idx) + if (SuccBII <= BII) Repeat = true; } } @@ -1612,13 +1632,15 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { // TODO: Could insert earlier and schedule more liberally with operations // that only use caller preserved registers. MachineBasicBlock &EntryBB = MF.front(); + MachineBasicBlock::iterator I = EntryBB.begin(); + for (MachineBasicBlock::iterator E = EntryBB.end(); + I != E && (I->isPHI() || I->isMetaInstruction()); ++I) + ; + BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); if (ST->hasVscnt()) - BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), - TII->get(AMDGPU::S_WAITCNT_VSCNT)) - .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(0); - BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) - .addImm(0); + BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(0); Modified = true; } |