author    Dimitry Andric <dim@FreeBSD.org>    2020-07-26 19:36:28 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2020-07-26 19:36:28 +0000
commit    cfca06d7963fa0909f90483b42a6d7d194d01e08 (patch)
tree      209fb2a2d68f8f277793fc8df46c753d31bc853b /llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
parent    706b4fc47bbc608932d3b491ae19a3b9cde9497b (diff)
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp  293
1 file changed, 220 insertions(+), 73 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 61d2719a3aad6..36d52ac3ee891 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -38,8 +38,8 @@
/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0 // Do the IF block of the branch
///
/// label0:
-/// %sgpr0 = S_OR_SAVEEXEC_B64 %exec // Restore the exec mask for the Then block
-/// %exec = S_XOR_B64 %sgpr0, %exec // Clear live bits from saved exec mask
+/// %sgpr0 = S_OR_SAVEEXEC_B64 %sgpr0 // Restore the exec mask for the Then block
+/// %exec = S_XOR_B64 %sgpr0, %exec // Update the exec mask
/// S_BRANCH_EXECZ label1 // Use our branch optimization
/// // instruction again.
/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr // Do the THEN block
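To sanity-check the corrected comment, here is a standalone C++ sketch (not part of the patch) that models the mask arithmetic on 8 lanes. S_OR_SAVEEXEC_B64 copies the old EXEC into its destination and ORs its source into EXEC, so feeding it %sgpr0 (the saved ELSE lanes) rather than %exec, followed by the S_XOR_B64, leaves exactly the ELSE lanes enabled:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint8_t cond = 0xB2; // lanes where the IF condition holds
      uint8_t exec = 0xFF;       // all 8 lanes active on entry

      // %sgpr0 = S_AND_SAVEEXEC_B64 %vcc: save EXEC, enable the THEN lanes.
      uint8_t sgpr0 = exec;
      exec &= cond;
      // %sgpr0 = S_XOR_B64 %sgpr0, %exec: clear live bits from the saved
      // mask, leaving the lanes that must run the ELSE side.
      sgpr0 ^= exec;

      // ... the THEN block executes with exec == saved & cond ...

      // label0: %sgpr0 = S_OR_SAVEEXEC_B64 %sgpr0: dst = old EXEC,
      // EXEC |= src (the source is %sgpr0, as the fixed comment says).
      const uint8_t then_lanes = exec;
      exec |= sgpr0;
      sgpr0 = then_lanes;
      // %exec = S_XOR_B64 %sgpr0, %exec: update EXEC to the ELSE lanes.
      exec ^= sgpr0;

      printf("ELSE lanes: 0x%02x (expected 0x%02x)\n", (unsigned)exec,
             (unsigned)(uint8_t)(0xFF & ~cond));
    }

Compiled and run, both values come out as 0x4d.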
@@ -51,6 +51,8 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveIntervals.h"
@@ -73,6 +75,10 @@ using namespace llvm;
#define DEBUG_TYPE "si-lower-control-flow"
+static cl::opt<bool>
+RemoveRedundantEndcf("amdgpu-remove-redundant-endcf",
+ cl::init(true), cl::ReallyHidden);
+
namespace {
class SILowerControlFlow : public MachineFunctionPass {
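Aside: although the new option is ReallyHidden, cl::opt booleans are still settable on the llc command line, so something like the invocation below should turn the cleanup off when bisecting a miscompile (the flag name is taken from the hunk above; the rest of the command is an illustrative guess at a typical amdgcn run):

    llc -march=amdgcn -mcpu=gfx900 -amdgpu-remove-redundant-endcf=0 in.ll -o out.s

From clang the same option would be forwarded with -mllvm -amdgpu-remove-redundant-endcf=0.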
@@ -81,8 +87,12 @@ private:
const SIInstrInfo *TII = nullptr;
LiveIntervals *LIS = nullptr;
MachineRegisterInfo *MRI = nullptr;
+ SetVector<MachineInstr*> LoweredEndCf;
+ DenseSet<Register> LoweredIf;
+ SmallSet<MachineInstr *, 16> NeedsKillCleanup;
const TargetRegisterClass *BoolRC = nullptr;
+ bool InsertKillCleanups;
unsigned AndOpc;
unsigned OrOpc;
unsigned XorOpc;
@@ -98,13 +108,23 @@ private:
void emitLoop(MachineInstr &MI);
void emitEndCf(MachineInstr &MI);
- Register getSaveExec(MachineInstr* MI);
-
void findMaskOperands(MachineInstr &MI, unsigned OpNo,
SmallVectorImpl<MachineOperand> &Src) const;
void combineMasks(MachineInstr &MI);
+ void process(MachineInstr &MI);
+
+ // Skip to the next instruction, ignoring debug instructions, and trivial
+ // block boundaries (blocks that have one (typically fallthrough) successor,
+ // and the successor has one predecessor).
+ MachineBasicBlock::iterator
+ skipIgnoreExecInstsTrivialSucc(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator It) const;
+
+ // Remove redundant SI_END_CF instructions.
+ void optimizeEndCf();
+
public:
static char ID;
@@ -144,62 +164,44 @@ static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
-static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
- const SIInstrInfo *TII) {
- Register SaveExecReg = MI.getOperand(0).getReg();
- auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
-
- if (U == MRI->use_instr_nodbg_end() ||
- std::next(U) != MRI->use_instr_nodbg_end() ||
- U->getOpcode() != AMDGPU::SI_END_CF)
- return false;
-
- // Check for SI_KILL_*_TERMINATOR on path from if to endif.
- // if there is any such terminator simplififcations are not safe.
- auto SMBB = MI.getParent();
- auto EMBB = U->getParent();
+static bool hasKill(const MachineBasicBlock *Begin,
+ const MachineBasicBlock *End, const SIInstrInfo *TII) {
DenseSet<const MachineBasicBlock*> Visited;
- SmallVector<MachineBasicBlock*, 4> Worklist(SMBB->succ_begin(),
- SMBB->succ_end());
+ SmallVector<MachineBasicBlock *, 4> Worklist(Begin->succ_begin(),
+ Begin->succ_end());
while (!Worklist.empty()) {
MachineBasicBlock *MBB = Worklist.pop_back_val();
- if (MBB == EMBB || !Visited.insert(MBB).second)
+ if (MBB == End || !Visited.insert(MBB).second)
continue;
- for(auto &Term : MBB->terminators())
+ for (auto &Term : MBB->terminators())
if (TII->isKillTerminator(Term.getOpcode()))
- return false;
+ return true;
Worklist.append(MBB->succ_begin(), MBB->succ_end());
}
- return true;
+ return false;
}
-Register SILowerControlFlow::getSaveExec(MachineInstr *MI) {
- MachineBasicBlock *MBB = MI->getParent();
- MachineOperand &SaveExec = MI->getOperand(0);
- assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister);
-
- Register SaveExecReg = SaveExec.getReg();
- unsigned FalseTermOpc =
- TII->isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
- MachineBasicBlock::iterator I = (MI);
- MachineBasicBlock::iterator J = std::next(I);
- if (J != MBB->end() && J->getOpcode() == FalseTermOpc &&
- J->getOperand(1).isReg() && J->getOperand(1).getReg() == SaveExecReg) {
- SaveExecReg = J->getOperand(0).getReg();
- J->eraseFromParent();
- }
- return SaveExecReg;
+static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
+ Register SaveExecReg = MI.getOperand(0).getReg();
+ auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
+
+ if (U == MRI->use_instr_nodbg_end() ||
+ std::next(U) != MRI->use_instr_nodbg_end() ||
+ U->getOpcode() != AMDGPU::SI_END_CF)
+ return false;
+
+ return true;
}
void SILowerControlFlow::emitIf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock::iterator I(&MI);
- Register SaveExecReg = getSaveExec(&MI);
+ Register SaveExecReg = MI.getOperand(0).getReg();
MachineOperand& Cond = MI.getOperand(1);
assert(Cond.getSubReg() == AMDGPU::NoSubRegister);
@@ -209,7 +211,35 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
// If there is only one use of save exec register and that use is SI_END_CF,
// we can optimize SI_IF by returning the full saved exec mask instead of
// just cleared bits.
- bool SimpleIf = isSimpleIf(MI, MRI, TII);
+ bool SimpleIf = isSimpleIf(MI, MRI);
+
+ if (InsertKillCleanups) {
+ // Check for SI_KILL_*_TERMINATOR on full path of control flow and
+ // flag the associated SI_END_CF for insertion of a kill cleanup.
+ auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
+ while (UseMI->getOpcode() != AMDGPU::SI_END_CF) {
+ assert(std::next(UseMI) == MRI->use_instr_nodbg_end());
+ assert(UseMI->getOpcode() == AMDGPU::SI_ELSE);
+ MachineOperand &NextExec = UseMI->getOperand(0);
+ Register NextExecReg = NextExec.getReg();
+ if (NextExec.isDead()) {
+ assert(!SimpleIf);
+ break;
+ }
+ UseMI = MRI->use_instr_nodbg_begin(NextExecReg);
+ }
+ if (UseMI->getOpcode() == AMDGPU::SI_END_CF) {
+ if (hasKill(MI.getParent(), UseMI->getParent(), TII)) {
+ NeedsKillCleanup.insert(&*UseMI);
+ SimpleIf = false;
+ }
+ }
+ } else if (SimpleIf) {
+ // Check for SI_KILL_*_TERMINATOR on path from if to endif.
+ // If there is any such terminator, simplifications are not safe.
+ auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
+ SimpleIf = !hasKill(MI.getParent(), UseMI->getParent(), TII);
+ }
// Add an implicit def of exec to discourage scheduling VALU after this which
// will interfere with trying to form s_and_saveexec_b64 later.
@@ -219,6 +249,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
.addReg(Exec)
.addReg(Exec, RegState::ImplicitDefine);
+ LoweredIf.insert(CopyReg);
Register Tmp = MRI->createVirtualRegister(BoolRC);
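On the InsertKillCleanups path above, the loop chases the single non-debug use of each saved-exec register through any intervening SI_ELSE until it reaches the SI_END_CF that closes the construct. Sketched as hypothetical MIR (operand shapes and register classes elided):

    %0 = SI_IF %vcc, %bb.else     ; emitIf starts the walk at %0
    ...
    %1 = SI_ELSE %0, %bb.endif    ; only use of %0, so the walk moves on to %1
    ...
    SI_END_CF %1                  ; only use of %1; flagged in NeedsKillCleanup
                                  ; when hasKill() finds a kill terminator
                                  ; between the SI_IF block and this one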
@@ -282,7 +313,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
- Register DstReg = getSaveExec(&MI);
+ Register DstReg = MI.getOperand(0).getReg();
bool ExecModified = MI.getOperand(3).getImm() != 0;
MachineBasicBlock::iterator Start = MBB.begin();
@@ -354,7 +385,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
- auto Dst = getSaveExec(&MI);
+ auto Dst = MI.getOperand(0).getReg();
// Skip ANDing with exec if the break condition is already masked by exec
// because it is a V_CMP in the same basic block. (We know the break
@@ -416,6 +447,38 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
MI.eraseFromParent();
}
+MachineBasicBlock::iterator
+SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const {
+
+ SmallSet<const MachineBasicBlock *, 4> Visited;
+ MachineBasicBlock *B = &MBB;
+ do {
+ if (!Visited.insert(B).second)
+ return MBB.end();
+
+ auto E = B->end();
+ for ( ; It != E; ++It) {
+ if (It->getOpcode() == AMDGPU::SI_KILL_CLEANUP)
+ continue;
+ if (TII->mayReadEXEC(*MRI, *It))
+ break;
+ }
+
+ if (It != E)
+ return It;
+
+ if (B->succ_size() != 1)
+ return MBB.end();
+
+ // If there is one trivial successor, advance to the next block.
+ MachineBasicBlock *Succ = *B->succ_begin();
+
+ It = Succ->begin();
+ B = Succ;
+ } while (true);
+}
+
void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
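To illustrate the new helper (again hypothetical MIR): starting just past a lowered END_CF, the walk steps over SI_KILL_CLEANUP and anything else that does not read EXEC, and follows a lone fallthrough successor into the next block:

    bb.1:
      %exec = S_OR_B64 %exec, %saved1 ; caller begins the walk after this
      SI_KILL_CLEANUP                 ; skipped explicitly
    bb.2:                             ; bb.1's only successor
      %exec = S_OR_B64 %exec, %saved2 ; first EXEC-reading instruction: returned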
@@ -430,8 +493,20 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
.addReg(Exec)
.add(MI.getOperand(0));
- if (LIS)
+ LoweredEndCf.insert(NewMI);
+
+ // If this ends control flow which contains kills (as flagged in emitIf)
+ // then insert an SI_KILL_CLEANUP immediately following the exec mask
+ // manipulation. This can be lowered to early termination if appropriate.
+ MachineInstr *CleanUpMI = nullptr;
+ if (NeedsKillCleanup.count(&MI))
+ CleanUpMI = BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_KILL_CLEANUP));
+
+ if (LIS) {
LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
+ if (CleanUpMI)
+ LIS->InsertMachineInstrInMaps(*CleanUpMI);
+ }
MI.eraseFromParent();
@@ -494,6 +569,84 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) {
MRI->getUniqueVRegDef(Reg)->eraseFromParent();
}
+void SILowerControlFlow::optimizeEndCf() {
+ // If the only instruction immediately following this END_CF is another
+ // END_CF in the only successor, we can avoid emitting the exec mask restore here.
+ if (!RemoveRedundantEndcf)
+ return;
+
+ for (MachineInstr *MI : LoweredEndCf) {
+ MachineBasicBlock &MBB = *MI->getParent();
+ auto Next =
+ skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI->getIterator()));
+ if (Next == MBB.end() || !LoweredEndCf.count(&*Next))
+ continue;
+ // Only skip the inner END_CF if the outer END_CF belongs to an SI_IF.
+ // If it belongs to an SI_ELSE then the saved mask has an inverted value.
+ Register SavedExec
+ = TII->getNamedOperand(*Next, AMDGPU::OpName::src1)->getReg();
+ assert(SavedExec.isVirtual() && "Expected saved exec to be src1!");
+
+ const MachineInstr *Def = MRI->getUniqueVRegDef(SavedExec);
+ if (Def && LoweredIf.count(SavedExec)) {
+ LLVM_DEBUG(dbgs() << "Skip redundant "; MI->dump());
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+ }
+ }
+}
+
+void SILowerControlFlow::process(MachineInstr &MI) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineBasicBlock::iterator I(MI);
+ MachineInstr *Prev = (I != MBB.begin()) ? &*(std::prev(I)) : nullptr;
+
+ switch (MI.getOpcode()) {
+ case AMDGPU::SI_IF:
+ emitIf(MI);
+ break;
+
+ case AMDGPU::SI_ELSE:
+ emitElse(MI);
+ break;
+
+ case AMDGPU::SI_IF_BREAK:
+ emitIfBreak(MI);
+ break;
+
+ case AMDGPU::SI_LOOP:
+ emitLoop(MI);
+ break;
+
+ case AMDGPU::SI_END_CF:
+ emitEndCf(MI);
+ break;
+
+ default:
+ assert(false && "Attempt to process unsupported instruction");
+ break;
+ }
+
+ MachineBasicBlock::iterator Next;
+ for (I = Prev ? Prev->getIterator() : MBB.begin(); I != MBB.end(); I = Next) {
+ Next = std::next(I);
+ MachineInstr &MaskMI = *I;
+ switch (MaskMI.getOpcode()) {
+ case AMDGPU::S_AND_B64:
+ case AMDGPU::S_OR_B64:
+ case AMDGPU::S_AND_B32:
+ case AMDGPU::S_OR_B32:
+ // Cleanup bit manipulations on exec mask
+ combineMasks(MaskMI);
+ break;
+ default:
+ I = MBB.end();
+ break;
+ }
+ }
+}
+
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
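Tying those pieces together: optimizeEndCf deletes an inner restore only when the walk lands on another lowered END_CF whose saved mask is one of the full-EXEC copies recorded in LoweredIf, i.e. the enclosing construct is an SI_IF; an SI_ELSE's saved mask holds an inverted value, so skipping there would be wrong. Roughly, in the style of the file's header comment (illustrative only):

    %exec = S_OR_B64 %exec, %inner_saved // inner END_CF: removed, because the
                                         // next EXEC-reading instruction is...
    %exec = S_OR_B64 %exec, %outer_saved // ...the outer END_CF, whose saved mask
                                         // copies the pre-IF EXEC and therefore
                                         // restores a superset of the lanes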
@@ -503,6 +656,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
LIS = getAnalysisIfAvailable<LiveIntervals>();
MRI = &MF.getRegInfo();
BoolRC = TRI->getBoolRC();
+ InsertKillCleanups =
+ MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
if (ST.isWave32()) {
AndOpc = AMDGPU::S_AND_B32;
@@ -524,57 +679,49 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
Exec = AMDGPU::EXEC;
}
+ SmallVector<MachineInstr *, 32> Worklist;
+
MachineFunction::iterator NextBB;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; BI = NextBB) {
NextBB = std::next(BI);
MachineBasicBlock &MBB = *BI;
- MachineBasicBlock::iterator I, Next, Last;
-
- for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
+ MachineBasicBlock::iterator I, Next;
+ for (I = MBB.begin(); I != MBB.end(); I = Next) {
Next = std::next(I);
MachineInstr &MI = *I;
switch (MI.getOpcode()) {
case AMDGPU::SI_IF:
- emitIf(MI);
+ process(MI);
break;
case AMDGPU::SI_ELSE:
- emitElse(MI);
- break;
-
case AMDGPU::SI_IF_BREAK:
- emitIfBreak(MI);
- break;
-
case AMDGPU::SI_LOOP:
- emitLoop(MI);
- break;
-
case AMDGPU::SI_END_CF:
- emitEndCf(MI);
+ // Only build worklist if SI_IF instructions must be processed first.
+ if (InsertKillCleanups)
+ Worklist.push_back(&MI);
+ else
+ process(MI);
break;
- case AMDGPU::S_AND_B64:
- case AMDGPU::S_OR_B64:
- case AMDGPU::S_AND_B32:
- case AMDGPU::S_OR_B32:
- // Cleanup bit manipulations on exec mask
- combineMasks(MI);
- Last = I;
- continue;
-
default:
- Last = I;
- continue;
+ break;
}
-
- // Replay newly inserted code to combine masks
- Next = (Last == MBB.end()) ? MBB.begin() : Last;
}
}
+ for (MachineInstr *MI : Worklist)
+ process(*MI);
+
+ optimizeEndCf();
+
+ LoweredEndCf.clear();
+ LoweredIf.clear();
+ NeedsKillCleanup.clear();
+
return true;
}
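One ordering subtlety in the rewritten loop: SI_IF is always lowered on the spot, while the remaining pseudos are deferred to the worklist when InsertKillCleanups is set. That guarantees every emitIf call has run, and NeedsKillCleanup is fully populated, before emitEndCf decides whether to append an SI_KILL_CLEANUP.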