summary refs log tree commit diff
path: root/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp')
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp 496
1 files changed, 259 insertions, 237 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ef662d55cb0a9..2a157eb20ab47 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -32,6 +32,7 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
@@ -57,7 +58,6 @@
#include <cstring>
#include <memory>
#include <utility>
-#include <vector>
using namespace llvm;
@@ -109,15 +109,13 @@ iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
enum_iterator<InstCounterType>(NUM_INST_CNTS));
}
-using RegInterval = std::pair<signed, signed>;
+using RegInterval = std::pair<int, int>;
struct {
- uint32_t VmcntMax;
- uint32_t ExpcntMax;
- uint32_t LgkmcntMax;
- uint32_t VscntMax;
- int32_t NumVGPRsMax;
- int32_t NumSGPRsMax;
+ unsigned VmcntMax;
+ unsigned ExpcntMax;
+ unsigned LgkmcntMax;
+ unsigned VscntMax;
} HardwareLimits;
struct {
@@ -143,7 +141,7 @@ enum WaitEventType {
NUM_WAIT_EVENTS,
};
-static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
+static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
(1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
(1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
(1 << SQ_MESSAGE),
@@ -166,6 +164,28 @@ enum RegisterMapping {
NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
+// Enumerate different types of result-returning VMEM operations. Although
+// s_waitcnt orders them all with a single vmcnt counter, in the absence of
+// s_waitcnt only instructions of the same VmemType are guaranteed to write
+// their results in order -- so there is no need to insert an s_waitcnt between
+// two instructions of the same type that write the same vgpr.
+enum VmemType {
+ // BUF instructions and MIMG instructions without a sampler.
+ VMEM_NOSAMPLER,
+ // MIMG instructions with a sampler.
+ VMEM_SAMPLER,
+};
+
+VmemType getVmemType(const MachineInstr &Inst) {
+ assert(SIInstrInfo::isVMEM(Inst));
+ if (!SIInstrInfo::isMIMG(Inst))
+ return VMEM_NOSAMPLER;
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
+ return AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler
+ ? VMEM_SAMPLER
+ : VMEM_NOSAMPLER;
+}
+
void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
switch (T) {
case VM_CNT:
@@ -195,12 +215,9 @@ void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
// "s_waitcnt 0" before use.
class WaitcntBrackets {
public:
- WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
- for (auto T : inst_counter_types())
- memset(VgprScores[T], 0, sizeof(VgprScores[T]));
- }
+ WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {}
- static uint32_t getWaitCountMax(InstCounterType T) {
+ static unsigned getWaitCountMax(InstCounterType T) {
switch (T) {
case VM_CNT:
return HardwareLimits.VmcntMax;
@@ -216,17 +233,13 @@ public:
return 0;
}
- uint32_t getScoreLB(InstCounterType T) const {
+ unsigned getScoreLB(InstCounterType T) const {
assert(T < NUM_INST_CNTS);
- if (T >= NUM_INST_CNTS)
- return 0;
return ScoreLBs[T];
}
- uint32_t getScoreUB(InstCounterType T) const {
+ unsigned getScoreUB(InstCounterType T) const {
assert(T < NUM_INST_CNTS);
- if (T >= NUM_INST_CNTS)
- return 0;
return ScoreUBs[T];
}
@@ -242,7 +255,7 @@ public:
return EXP_CNT;
}
- uint32_t getRegScore(int GprNo, InstCounterType T) {
+ unsigned getRegScore(int GprNo, InstCounterType T) {
if (GprNo < NUM_ALL_VGPRS) {
return VgprScores[T][GprNo];
}
@@ -250,30 +263,16 @@ public:
return SgprScores[GprNo - NUM_ALL_VGPRS];
}
- void clear() {
- memset(ScoreLBs, 0, sizeof(ScoreLBs));
- memset(ScoreUBs, 0, sizeof(ScoreUBs));
- PendingEvents = 0;
- memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents));
- for (auto T : inst_counter_types())
- memset(VgprScores[T], 0, sizeof(VgprScores[T]));
- memset(SgprScores, 0, sizeof(SgprScores));
- }
-
bool merge(const WaitcntBrackets &Other);
RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
const MachineRegisterInfo *MRI,
- const SIRegisterInfo *TRI, unsigned OpNo,
- bool Def) const;
-
- int32_t getMaxVGPR() const { return VgprUB; }
- int32_t getMaxSGPR() const { return SgprUB; }
+ const SIRegisterInfo *TRI, unsigned OpNo) const;
bool counterOutOfOrder(InstCounterType T) const;
bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
- void determineWait(InstCounterType T, uint32_t ScoreToWait,
+ void determineWait(InstCounterType T, unsigned ScoreToWait,
AMDGPU::Waitcnt &Wait) const;
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
void applyWaitcnt(InstCounterType T, unsigned Count);
@@ -286,6 +285,12 @@ public:
return PendingEvents & (1 << E);
}
+ bool hasMixedPendingEvents(InstCounterType T) const {
+ unsigned Events = PendingEvents & WaitEventMaskForInst[T];
+ // Return true if more than one bit is set in Events.
+ return Events & (Events - 1);
+ }
+
bool hasPendingFlat() const {
return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
@@ -298,71 +303,77 @@ public:
LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
}
+ // Return true if there might be pending writes to the specified vgpr by VMEM
+ // instructions with types different from V.
+ bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
+ assert(GprNo < NUM_ALL_VGPRS);
+ return VgprVmemTypes[GprNo] & ~(1 << V);
+ }
+
+ void clearVgprVmemTypes(int GprNo) {
+ assert(GprNo < NUM_ALL_VGPRS);
+ VgprVmemTypes[GprNo] = 0;
+ }
+
void print(raw_ostream &);
void dump() { print(dbgs()); }
private:
struct MergeInfo {
- uint32_t OldLB;
- uint32_t OtherLB;
- uint32_t MyShift;
- uint32_t OtherShift;
+ unsigned OldLB;
+ unsigned OtherLB;
+ unsigned MyShift;
+ unsigned OtherShift;
};
- static bool mergeScore(const MergeInfo &M, uint32_t &Score,
- uint32_t OtherScore);
+ static bool mergeScore(const MergeInfo &M, unsigned &Score,
+ unsigned OtherScore);
- void setScoreLB(InstCounterType T, uint32_t Val) {
+ void setScoreLB(InstCounterType T, unsigned Val) {
assert(T < NUM_INST_CNTS);
- if (T >= NUM_INST_CNTS)
- return;
ScoreLBs[T] = Val;
}
- void setScoreUB(InstCounterType T, uint32_t Val) {
+ void setScoreUB(InstCounterType T, unsigned Val) {
assert(T < NUM_INST_CNTS);
- if (T >= NUM_INST_CNTS)
- return;
ScoreUBs[T] = Val;
if (T == EXP_CNT) {
- uint32_t UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
+ unsigned UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
ScoreLBs[T] = UB;
}
}
- void setRegScore(int GprNo, InstCounterType T, uint32_t Val) {
+ void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
if (GprNo < NUM_ALL_VGPRS) {
- if (GprNo > VgprUB) {
- VgprUB = GprNo;
- }
+ VgprUB = std::max(VgprUB, GprNo);
VgprScores[T][GprNo] = Val;
} else {
assert(T == LGKM_CNT);
- if (GprNo - NUM_ALL_VGPRS > SgprUB) {
- SgprUB = GprNo - NUM_ALL_VGPRS;
- }
+ SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
}
}
void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
- unsigned OpNo, uint32_t Val);
+ unsigned OpNo, unsigned Val);
const GCNSubtarget *ST = nullptr;
- uint32_t ScoreLBs[NUM_INST_CNTS] = {0};
- uint32_t ScoreUBs[NUM_INST_CNTS] = {0};
- uint32_t PendingEvents = 0;
- bool MixedPendingEvents[NUM_INST_CNTS] = {false};
+ unsigned ScoreLBs[NUM_INST_CNTS] = {0};
+ unsigned ScoreUBs[NUM_INST_CNTS] = {0};
+ unsigned PendingEvents = 0;
// Remember the last flat memory operation.
- uint32_t LastFlat[NUM_INST_CNTS] = {0};
+ unsigned LastFlat[NUM_INST_CNTS] = {0};
// wait_cnt scores for every vgpr.
// Keep track of the VgprUB and SgprUB to make merge at join efficient.
- int32_t VgprUB = 0;
- int32_t SgprUB = 0;
- uint32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
+ int VgprUB = -1;
+ int SgprUB = -1;
+ unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
// Wait cnt scores for every sgpr, only lgkmcnt is relevant.
- uint32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
+ unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
+ // Bitmask of the VmemTypes of VMEM instructions that might have a pending
+ // write to each vgpr.
+ unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
};
class SIInsertWaitcnts : public MachineFunctionPass {
@@ -385,8 +396,7 @@ private:
explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
};
- std::vector<BlockInfo> BlockInfos; // by reverse post-order traversal index
- DenseMap<MachineBasicBlock *, unsigned> RpotIdxMap;
+ MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
// ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
// because of amdgpu-waitcnt-forcezero flag
@@ -464,10 +474,10 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
const SIInstrInfo *TII,
const MachineRegisterInfo *MRI,
const SIRegisterInfo *TRI,
- unsigned OpNo, bool Def) const {
+ unsigned OpNo) const {
const MachineOperand &Op = MI->getOperand(OpNo);
- if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
- (Def && !Op.isDef()) || TRI->isAGPR(*MRI, Op.getReg()))
+ assert(Op.isReg());
+ if (!TRI->isInAllocatableClass(Op.getReg()) || TRI->isAGPR(*MRI, Op.getReg()))
return {-1, -1};
// A use via a PW operand does not need a waitcnt.
@@ -475,29 +485,27 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
assert(!Op.getSubReg() || !Op.isUndef());
RegInterval Result;
- const MachineRegisterInfo &MRIA = *MRI;
unsigned Reg = TRI->getEncodingValue(Op.getReg());
- if (TRI->isVGPR(MRIA, Op.getReg())) {
+ if (TRI->isVGPR(*MRI, Op.getReg())) {
assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
Result.first = Reg - RegisterEncoding.VGPR0;
assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
- } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
+ } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
assert(Result.first >= NUM_ALL_VGPRS &&
Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
}
// TODO: Handle TTMP
- // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
+ // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
else
return {-1, -1};
- const MachineInstr &MIA = *MI;
- const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
+ const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo);
unsigned Size = TRI->getRegSizeInBits(*RC);
- Result.second = Result.first + (Size / 32);
+ Result.second = Result.first + ((Size + 16) / 32);
return Result;
}
@@ -506,13 +514,10 @@ void WaitcntBrackets::setExpScore(const MachineInstr *MI,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
const MachineRegisterInfo *MRI, unsigned OpNo,
- uint32_t Val) {
- RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
- LLVM_DEBUG({
- const MachineOperand &Opnd = MI->getOperand(OpNo);
- assert(TRI->isVGPR(*MRI, Opnd.getReg()));
- });
- for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ unsigned Val) {
+ RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
+ assert(TRI->isVGPR(*MRI, MI->getOperand(OpNo).getReg()));
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
setRegScore(RegNo, EXP_CNT, Val);
}
}
@@ -521,19 +526,14 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
const MachineRegisterInfo *MRI,
WaitEventType E, MachineInstr &Inst) {
- const MachineRegisterInfo &MRIA = *MRI;
InstCounterType T = eventCounter(E);
- uint32_t CurrScore = getScoreUB(T) + 1;
+ unsigned CurrScore = getScoreUB(T) + 1;
if (CurrScore == 0)
report_fatal_error("InsertWaitcnt score wraparound");
// PendingEvents and ScoreUB need to be updated regardless of whether this
// event changes the score of a register.
// Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
- if (!hasPendingEvent(E)) {
- if (PendingEvents & WaitEventMaskForInst[T])
- MixedPendingEvents[T] = true;
- PendingEvents |= 1 << E;
- }
+ PendingEvents |= 1 << E;
setScoreUB(T, CurrScore);
if (T == EXP_CNT) {
@@ -574,7 +574,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
const MachineOperand &Op = Inst.getOperand(I);
- if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
+ if (Op.isReg() && !Op.isDef() && TRI->isVGPR(*MRI, Op.getReg())) {
setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
}
}
@@ -622,7 +622,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
MachineOperand &DefMO = Inst.getOperand(I);
if (DefMO.isReg() && DefMO.isDef() &&
- TRI->isVGPR(MRIA, DefMO.getReg())) {
+ TRI->isVGPR(*MRI, DefMO.getReg())) {
setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
CurrScore);
}
@@ -630,7 +630,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
}
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
MachineOperand &MO = Inst.getOperand(I);
- if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
+ if (MO.isReg() && !MO.isDef() && TRI->isVGPR(*MRI, MO.getReg())) {
setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
}
}
@@ -641,8 +641,8 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
unsigned OpNo;//TODO: find the OpNo for this operand;
- RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
- for (signed RegNo = Interval.first; RegNo < Interval.second;
+ RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo);
+ for (int RegNo = Interval.first; RegNo < Interval.second;
++RegNo) {
setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
}
@@ -650,10 +650,20 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
} else {
// Match the score to the destination registers.
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
- RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
- if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
+ auto &Op = Inst.getOperand(I);
+ if (!Op.isReg() || !Op.isDef())
continue;
- for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
+ if (T == VM_CNT) {
+ if (Interval.first >= NUM_ALL_VGPRS)
+ continue;
+ if (SIInstrInfo::isVMEM(Inst)) {
+ VmemType V = getVmemType(Inst);
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
+ VgprVmemTypes[RegNo] |= 1 << V;
+ }
+ }
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
setRegScore(RegNo, T, CurrScore);
}
}
@@ -666,8 +676,8 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
void WaitcntBrackets::print(raw_ostream &OS) {
OS << '\n';
for (auto T : inst_counter_types()) {
- uint32_t LB = getScoreLB(T);
- uint32_t UB = getScoreUB(T);
+ unsigned LB = getScoreLB(T);
+ unsigned UB = getScoreUB(T);
switch (T) {
case VM_CNT:
@@ -689,11 +699,11 @@ void WaitcntBrackets::print(raw_ostream &OS) {
if (LB < UB) {
// Print vgpr scores.
- for (int J = 0; J <= getMaxVGPR(); J++) {
- uint32_t RegScore = getRegScore(J, T);
+ for (int J = 0; J <= VgprUB; J++) {
+ unsigned RegScore = getRegScore(J, T);
if (RegScore <= LB)
continue;
- uint32_t RelScore = RegScore - LB - 1;
+ unsigned RelScore = RegScore - LB - 1;
if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
OS << RelScore << ":v" << J << " ";
} else {
@@ -702,11 +712,11 @@ void WaitcntBrackets::print(raw_ostream &OS) {
}
// Also need to print sgpr scores for lgkm_cnt.
if (T == LGKM_CNT) {
- for (int J = 0; J <= getMaxSGPR(); J++) {
- uint32_t RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
+ for (int J = 0; J <= SgprUB; J++) {
+ unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
if (RegScore <= LB)
continue;
- uint32_t RelScore = RegScore - LB - 1;
+ unsigned RelScore = RegScore - LB - 1;
OS << RelScore << ":s" << J << " ";
}
}
@@ -727,8 +737,8 @@ bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
unsigned &Count) const {
- const uint32_t LB = getScoreLB(T);
- const uint32_t UB = getScoreUB(T);
+ const unsigned LB = getScoreLB(T);
+ const unsigned UB = getScoreUB(T);
if (Count < UB && UB - Count > LB)
return true;
@@ -736,12 +746,12 @@ bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
return false;
}
-void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait,
+void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait,
AMDGPU::Waitcnt &Wait) const {
// If the score of src_operand falls within the bracket, we need an
// s_waitcnt instruction.
- const uint32_t LB = getScoreLB(T);
- const uint32_t UB = getScoreUB(T);
+ const unsigned LB = getScoreLB(T);
+ const unsigned UB = getScoreUB(T);
if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
if ((T == VM_CNT || T == LGKM_CNT) &&
hasPendingFlat() &&
@@ -758,7 +768,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait,
} else {
// If a counter has been maxed out avoid overflow by waiting for
// MAX(CounterType) - 1 instead.
- uint32_t NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
+ unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
addWait(Wait, T, NeededWait);
}
}
@@ -772,7 +782,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
}
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
- const uint32_t UB = getScoreUB(T);
+ const unsigned UB = getScoreUB(T);
if (Count >= UB)
return;
if (Count != 0) {
@@ -781,7 +791,6 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
setScoreLB(T, std::max(getScoreLB(T), UB - Count));
} else {
setScoreLB(T, UB);
- MixedPendingEvents[T] = false;
PendingEvents &= ~WaitEventMaskForInst[T];
}
}
@@ -792,7 +801,7 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
// Scalar memory reads can always go out of order.
if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
return true;
- return MixedPendingEvents[T];
+ return hasMixedPendingEvents(T);
}
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
@@ -954,10 +963,10 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
int CallAddrOpIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
- RegInterval CallAddrOpInterval = ScoreBrackets.getRegInterval(
- &MI, TII, MRI, TRI, CallAddrOpIdx, false);
+ RegInterval CallAddrOpInterval =
+ ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx);
- for (signed RegNo = CallAddrOpInterval.first;
+ for (int RegNo = CallAddrOpInterval.first;
RegNo < CallAddrOpInterval.second; ++RegNo)
ScoreBrackets.determineWait(
LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
@@ -965,10 +974,10 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
int RtnAddrOpIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
if (RtnAddrOpIdx != -1) {
- RegInterval RtnAddrOpInterval = ScoreBrackets.getRegInterval(
- &MI, TII, MRI, TRI, RtnAddrOpIdx, false);
+ RegInterval RtnAddrOpInterval =
+ ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx);
- for (signed RegNo = RtnAddrOpInterval.first;
+ for (int RegNo = RtnAddrOpInterval.first;
RegNo < RtnAddrOpInterval.second; ++RegNo)
ScoreBrackets.determineWait(
LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
@@ -982,7 +991,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// emitted.
// If the source operand was defined by a load, add the s_waitcnt
// instruction.
+ //
+ // Two cases are handled for destination operands:
+ // 1) If the destination operand was defined by a load, add the s_waitcnt
+ // instruction to guarantee the right WAW order.
+ // 2) If the destination operand was used by a recent export/store
+ // instruction, add an s_waitcnt on exp_cnt to guarantee the WAR order.
for (const MachineMemOperand *Memop : MI.memoperands()) {
+ const Value *Ptr = Memop->getValue();
+ if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
+ addWait(Wait, LGKM_CNT, 0);
+ if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
+ SLoadAddresses.erase(Ptr);
+ }
unsigned AS = Memop->getAddrSpace();
if (AS != AMDGPUAS::LOCAL_ADDRESS)
continue;
@@ -990,67 +1011,41 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// VM_CNT is only relevant to vgpr or LDS.
ScoreBrackets.determineWait(
VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
- }
-
- for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
- const MachineOperand &Op = MI.getOperand(I);
- const MachineRegisterInfo &MRIA = *MRI;
- RegInterval Interval =
- ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
- for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- if (TRI->isVGPR(MRIA, Op.getReg())) {
- // VM_CNT is only relevant to vgpr or LDS.
- ScoreBrackets.determineWait(
- VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
- }
- ScoreBrackets.determineWait(
- LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
- }
- }
- // End of for loop that looks at all source operands to decide vm_wait_cnt
- // and lgk_wait_cnt.
-
- // Two cases are handled for destination operands:
- // 1) If the destination operand was defined by a load, add the s_waitcnt
- // instruction to guarantee the right WAW order.
- // 2) If a destination operand that was used by a recent export/store ins,
- // add s_waitcnt on exp_cnt to guarantee the WAR order.
- if (MI.mayStore()) {
- // FIXME: Should not be relying on memoperands.
- for (const MachineMemOperand *Memop : MI.memoperands()) {
- const Value *Ptr = Memop->getValue();
- if (SLoadAddresses.count(Ptr)) {
- addWait(Wait, LGKM_CNT, 0);
- if (PDT->dominates(MI.getParent(),
- SLoadAddresses.find(Ptr)->second))
- SLoadAddresses.erase(Ptr);
- }
- unsigned AS = Memop->getAddrSpace();
- if (AS != AMDGPUAS::LOCAL_ADDRESS)
- continue;
- unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
- ScoreBrackets.determineWait(
- VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+ if (Memop->isStore()) {
ScoreBrackets.determineWait(
EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
}
}
+
+ // Loop over use and def operands.
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
- MachineOperand &Def = MI.getOperand(I);
- const MachineRegisterInfo &MRIA = *MRI;
+ MachineOperand &Op = MI.getOperand(I);
+ if (!Op.isReg())
+ continue;
RegInterval Interval =
- ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
- for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- if (TRI->isVGPR(MRIA, Def.getReg())) {
- ScoreBrackets.determineWait(
- VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
- ScoreBrackets.determineWait(
- EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
+ ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ if (TRI->isVGPR(*MRI, Op.getReg())) {
+ // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
+ // previous write and this write are the same type of VMEM
+ // instruction, in which case they're guaranteed to write their
+ // results in order anyway.
+ if (Op.isUse() || !SIInstrInfo::isVMEM(MI) ||
+ ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
+ getVmemType(MI))) {
+ ScoreBrackets.determineWait(
+ VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+ ScoreBrackets.clearVgprVmemTypes(RegNo);
+ }
+ if (Op.isDef()) {
+ ScoreBrackets.determineWait(
+ EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
+ }
}
ScoreBrackets.determineWait(
LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
}
- } // End of for loop that looks at all dest operands.
+ }
}
}
@@ -1154,7 +1149,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
}
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << MI << '\n'
+ << "Old Instr: " << MI
<< "New Instr: " << *II << '\n');
if (!Wait.hasWait())
@@ -1171,7 +1166,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
Modified = true;
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << MI << '\n'
+ << "Old Instr: " << MI
<< "New Instr: " << *SWaitInst << '\n');
}
@@ -1187,7 +1182,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
Modified = true;
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << MI << '\n'
+ << "Old Instr: " << MI
<< "New Instr: " << *SWaitInst << '\n');
}
@@ -1303,10 +1298,10 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
}
}
-bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score,
- uint32_t OtherScore) {
- uint32_t MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
- uint32_t OtherShifted =
+bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
+ unsigned OtherScore) {
+ unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
+ unsigned OtherShifted =
OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
Score = std::max(MyShifted, OtherShifted);
return OtherShifted > MyShifted;
@@ -1320,44 +1315,50 @@ bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score,
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
bool StrictDom = false;
+ VgprUB = std::max(VgprUB, Other.VgprUB);
+ SgprUB = std::max(SgprUB, Other.SgprUB);
+
for (auto T : inst_counter_types()) {
// Merge event flags for this counter
const bool OldOutOfOrder = counterOutOfOrder(T);
- const uint32_t OldEvents = PendingEvents & WaitEventMaskForInst[T];
- const uint32_t OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
+ const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
+ const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
if (OtherEvents & ~OldEvents)
StrictDom = true;
- if (Other.MixedPendingEvents[T] ||
- (OldEvents && OtherEvents && OldEvents != OtherEvents))
- MixedPendingEvents[T] = true;
PendingEvents |= OtherEvents;
// Merge scores for this counter
- const uint32_t MyPending = ScoreUBs[T] - ScoreLBs[T];
- const uint32_t OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
+ const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
+ const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
+ const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
+ if (NewUB < ScoreLBs[T])
+ report_fatal_error("waitcnt score overflow");
+
MergeInfo M;
M.OldLB = ScoreLBs[T];
M.OtherLB = Other.ScoreLBs[T];
- M.MyShift = OtherPending > MyPending ? OtherPending - MyPending : 0;
- M.OtherShift = ScoreUBs[T] - Other.ScoreUBs[T] + M.MyShift;
+ M.MyShift = NewUB - ScoreUBs[T];
+ M.OtherShift = NewUB - Other.ScoreUBs[T];
- const uint32_t NewUB = ScoreUBs[T] + M.MyShift;
- if (NewUB < ScoreUBs[T])
- report_fatal_error("waitcnt score overflow");
ScoreUBs[T] = NewUB;
- ScoreLBs[T] = std::min(M.OldLB + M.MyShift, M.OtherLB + M.OtherShift);
StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
bool RegStrictDom = false;
- for (int J = 0, E = std::max(getMaxVGPR(), Other.getMaxVGPR()) + 1; J != E;
- J++) {
+ for (int J = 0; J <= VgprUB; J++) {
RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
}
+ if (T == VM_CNT) {
+ for (int J = 0; J <= VgprUB; J++) {
+ unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
+ RegStrictDom |= NewVmemTypes != VgprVmemTypes[J];
+ VgprVmemTypes[J] = NewVmemTypes;
+ }
+ }
+
if (T == LGKM_CNT) {
- for (int J = 0, E = std::max(getMaxSGPR(), Other.getMaxSGPR()) + 1;
- J != E; J++) {
+ for (int J = 0; J <= SgprUB; J++) {
RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
}
}
@@ -1366,9 +1367,6 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
StrictDom = true;
}
- VgprUB = std::max(getMaxVGPR(), Other.getMaxVGPR());
- SgprUB = std::max(getMaxSGPR(), Other.getMaxSGPR());
-
return StrictDom;
}
@@ -1383,6 +1381,10 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
ScoreBrackets.dump();
});
+ // Assume VCCZ is correct at basic block boundaries, unless and until we need
+ // to handle cases where that is not true.
+ bool VCCZCorrect = true;
+
// Walk over the instructions.
MachineInstr *OldWaitcntInstr = nullptr;
@@ -1402,13 +1404,26 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
continue;
}
- bool VCCZBugWorkAround = false;
+ // We might need to restore vccz to its correct value for either of two
+ // different reasons; see ST->hasReadVCCZBug() and
+ // ST->partialVCCWritesUpdateVCCZ().
+ bool RestoreVCCZ = false;
if (readsVCCZ(Inst)) {
- if (ScoreBrackets.getScoreLB(LGKM_CNT) <
- ScoreBrackets.getScoreUB(LGKM_CNT) &&
- ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
- if (ST->hasReadVCCZBug())
- VCCZBugWorkAround = true;
+ if (!VCCZCorrect)
+ RestoreVCCZ = true;
+ else if (ST->hasReadVCCZBug()) {
+ // There is a hardware bug on CI/SI where an SMRD instruction may corrupt
+ // the vccz bit, so when we detect that an instruction may read from a
+ // corrupt vccz bit, we need to:
+ // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
+ // operations to complete.
+ // 2. Restore the correct value of vccz by writing the current value
+ // of vcc back to vcc.
+ if (ScoreBrackets.getScoreLB(LGKM_CNT) <
+ ScoreBrackets.getScoreUB(LGKM_CNT) &&
+ ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
+ RestoreVCCZ = true;
+ }
}
}
@@ -1419,6 +1434,16 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
}
}
+ if (!ST->partialVCCWritesUpdateVCCZ()) {
+ // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
+ // Writes to vcc will fix it.
+ if (Inst.definesRegister(AMDGPU::VCC_LO) ||
+ Inst.definesRegister(AMDGPU::VCC_HI))
+ VCCZCorrect = false;
+ else if (Inst.definesRegister(AMDGPU::VCC))
+ VCCZCorrect = true;
+ }
+
// Generate an s_waitcnt instruction to be placed before
// cur_Inst, if needed.
Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
@@ -1444,7 +1469,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// TODO: Remove this work-around after fixing the scheduler and enable the
// assert above.
- if (VCCZBugWorkAround) {
+ if (RestoreVCCZ) {
// Restore the vccz bit. Any time a value is written to vcc, the vccz
// bit is updated, so we can restore the bit by reading the value of
// vcc and then writing it back to the register.
@@ -1452,6 +1477,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
TRI->getVCC())
.addReg(TRI->getVCC());
+ VCCZCorrect = true;
Modified = true;
}
@@ -1479,29 +1505,23 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0;
- HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
- HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
- assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
- assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
+ unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
+ unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
+ assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
+ assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
- RegisterEncoding.VGPRL =
- RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
+ RegisterEncoding.VGPRL = RegisterEncoding.VGPR0 + NumVGPRsMax - 1;
RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
- RegisterEncoding.SGPRL =
- RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
+ RegisterEncoding.SGPRL = RegisterEncoding.SGPR0 + NumSGPRsMax - 1;
TrackedWaitcntSet.clear();
- RpotIdxMap.clear();
BlockInfos.clear();
// Keep iterating over the blocks in reverse post order, inserting and
// updating s_waitcnt where needed, until a fix point is reached.
- for (MachineBasicBlock *MBB :
- ReversePostOrderTraversal<MachineFunction *>(&MF)) {
- RpotIdxMap[MBB] = BlockInfos.size();
- BlockInfos.emplace_back(MBB);
- }
+ for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
+ BlockInfos.insert({MBB, BlockInfo(MBB)});
std::unique_ptr<WaitcntBrackets> Brackets;
bool Modified = false;
@@ -1509,12 +1529,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
do {
Repeat = false;
- for (BlockInfo &BI : BlockInfos) {
+ for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
+ ++BII) {
+ BlockInfo &BI = BII->second;
if (!BI.Dirty)
continue;
- unsigned Idx = std::distance(&*BlockInfos.begin(), &BI);
-
if (BI.Incoming) {
if (!Brackets)
Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
@@ -1524,7 +1544,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
if (!Brackets)
Brackets = std::make_unique<WaitcntBrackets>(ST);
else
- Brackets->clear();
+ *Brackets = WaitcntBrackets(ST);
}
Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
@@ -1533,11 +1553,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
if (Brackets->hasPending()) {
BlockInfo *MoveBracketsToSucc = nullptr;
for (MachineBasicBlock *Succ : BI.MBB->successors()) {
- unsigned SuccIdx = RpotIdxMap[Succ];
- BlockInfo &SuccBI = BlockInfos[SuccIdx];
+ auto SuccBII = BlockInfos.find(Succ);
+ BlockInfo &SuccBI = SuccBII->second;
if (!SuccBI.Incoming) {
SuccBI.Dirty = true;
- if (SuccIdx <= Idx)
+ if (SuccBII <= BII)
Repeat = true;
if (!MoveBracketsToSucc) {
MoveBracketsToSucc = &SuccBI;
@@ -1546,7 +1566,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
}
} else if (SuccBI.Incoming->merge(*Brackets)) {
SuccBI.Dirty = true;
- if (SuccIdx <= Idx)
+ if (SuccBII <= BII)
Repeat = true;
}
}
@@ -1612,13 +1632,15 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
// TODO: Could insert earlier and schedule more liberally with operations
// that only use caller preserved registers.
MachineBasicBlock &EntryBB = MF.front();
+ MachineBasicBlock::iterator I = EntryBB.begin();
+ for (MachineBasicBlock::iterator E = EntryBB.end();
+ I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
+ ;
+ BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
if (ST->hasVscnt())
- BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(),
- TII->get(AMDGPU::S_WAITCNT_VSCNT))
- .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
- .addImm(0);
- BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
- .addImm(0);
+ BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(0);
Modified = true;
}