author | Dimitry Andric <dim@FreeBSD.org> | 2020-07-26 19:36:28 +0000
---|---|---
committer | Dimitry Andric <dim@FreeBSD.org> | 2020-07-26 19:36:28 +0000
commit | cfca06d7963fa0909f90483b42a6d7d194d01e08
tree | 209fb2a2d68f8f277793fc8df46c753d31bc853b /llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
parent | 706b4fc47bbc608932d3b491ae19a3b9cde9497b
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 293
1 file changed, 220 insertions(+), 73 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 61d2719a3aad6..36d52ac3ee891 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -38,8 +38,8 @@
 ///  %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0 // Do the IF block of the branch
 ///
 /// label0:
-/// %sgpr0 = S_OR_SAVEEXEC_B64 %exec   // Restore the exec mask for the Then block
-/// %exec = S_XOR_B64 %sgpr0, %exec    // Clear live bits from saved exec mask
+/// %sgpr0 = S_OR_SAVEEXEC_B64 %sgpr0  // Restore the exec mask for the Then block
+/// %exec = S_XOR_B64 %sgpr0, %exec    // Update the exec mask
 /// S_BRANCH_EXECZ label1              // Use our branch optimization
 ///                                    // instruction again.
 ///  %vgpr0 = V_SUB_F32 %vgpr0, %vgpr  // Do the THEN block
@@ -51,6 +51,8 @@
 #include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/LiveIntervals.h"
@@ -73,6 +75,10 @@
 using namespace llvm;
 
 #define DEBUG_TYPE "si-lower-control-flow"
 
+static cl::opt<bool>
+RemoveRedundantEndcf("amdgpu-remove-redundant-endcf",
+                     cl::init(true), cl::ReallyHidden);
+
 namespace {
 
 class SILowerControlFlow : public MachineFunctionPass {
@@ -81,8 +87,12 @@ private:
   const SIInstrInfo *TII = nullptr;
   LiveIntervals *LIS = nullptr;
   MachineRegisterInfo *MRI = nullptr;
+  SetVector<MachineInstr*> LoweredEndCf;
+  DenseSet<Register> LoweredIf;
+  SmallSet<MachineInstr *, 16> NeedsKillCleanup;
 
   const TargetRegisterClass *BoolRC = nullptr;
+  bool InsertKillCleanups;
   unsigned AndOpc;
   unsigned OrOpc;
   unsigned XorOpc;
@@ -98,13 +108,23 @@ private:
   void emitLoop(MachineInstr &MI);
   void emitEndCf(MachineInstr &MI);
 
-  Register getSaveExec(MachineInstr* MI);
-
   void findMaskOperands(MachineInstr &MI, unsigned OpNo,
                         SmallVectorImpl<MachineOperand> &Src) const;
 
   void combineMasks(MachineInstr &MI);
 
+  void process(MachineInstr &MI);
+
+  // Skip to the next instruction, ignoring debug instructions, and trivial
+  // block boundaries (blocks that have one (typically fallthrough) successor,
+  // and the successor has one predecessor).
+  MachineBasicBlock::iterator
+  skipIgnoreExecInstsTrivialSucc(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator It) const;
+
+  // Remove redundant SI_END_CF instructions.
+  void optimizeEndCf();
+
 public:
   static char ID;
@@ -144,62 +164,44 @@ static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
 
 char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
 
-static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
-                       const SIInstrInfo *TII) {
-  Register SaveExecReg = MI.getOperand(0).getReg();
-  auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
-
-  if (U == MRI->use_instr_nodbg_end() ||
-      std::next(U) != MRI->use_instr_nodbg_end() ||
-      U->getOpcode() != AMDGPU::SI_END_CF)
-    return false;
-
-  // Check for SI_KILL_*_TERMINATOR on path from if to endif.
-  // If there is any such terminator, simplifications are not safe.
-  auto SMBB = MI.getParent();
-  auto EMBB = U->getParent();
+static bool hasKill(const MachineBasicBlock *Begin,
+                    const MachineBasicBlock *End, const SIInstrInfo *TII) {
   DenseSet<const MachineBasicBlock*> Visited;
-  SmallVector<MachineBasicBlock*, 4> Worklist(SMBB->succ_begin(),
-                                              SMBB->succ_end());
+  SmallVector<MachineBasicBlock *, 4> Worklist(Begin->succ_begin(),
+                                               Begin->succ_end());
 
   while (!Worklist.empty()) {
     MachineBasicBlock *MBB = Worklist.pop_back_val();
 
-    if (MBB == EMBB || !Visited.insert(MBB).second)
+    if (MBB == End || !Visited.insert(MBB).second)
       continue;
-    for(auto &Term : MBB->terminators())
+    for (auto &Term : MBB->terminators())
       if (TII->isKillTerminator(Term.getOpcode()))
-        return false;
+        return true;
 
     Worklist.append(MBB->succ_begin(), MBB->succ_end());
   }
 
-  return true;
+  return false;
 }
 
-Register SILowerControlFlow::getSaveExec(MachineInstr *MI) {
-  MachineBasicBlock *MBB = MI->getParent();
-  MachineOperand &SaveExec = MI->getOperand(0);
-  assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister);
-
-  Register SaveExecReg = SaveExec.getReg();
-  unsigned FalseTermOpc =
-      TII->isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
-  MachineBasicBlock::iterator I = (MI);
-  MachineBasicBlock::iterator J = std::next(I);
-  if (J != MBB->end() && J->getOpcode() == FalseTermOpc &&
-      J->getOperand(1).isReg() && J->getOperand(1).getReg() == SaveExecReg) {
-    SaveExecReg = J->getOperand(0).getReg();
-    J->eraseFromParent();
-  }
-  return SaveExecReg;
+static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
+  Register SaveExecReg = MI.getOperand(0).getReg();
+  auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
+
+  if (U == MRI->use_instr_nodbg_end() ||
+      std::next(U) != MRI->use_instr_nodbg_end() ||
+      U->getOpcode() != AMDGPU::SI_END_CF)
+    return false;
+
+  return true;
 }
 
 void SILowerControlFlow::emitIf(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
   MachineBasicBlock::iterator I(&MI);
-  Register SaveExecReg = getSaveExec(&MI);
+  Register SaveExecReg = MI.getOperand(0).getReg();
   MachineOperand& Cond = MI.getOperand(1);
   assert(Cond.getSubReg() == AMDGPU::NoSubRegister);
@@ -209,7 +211,35 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
   // If there is only one use of save exec register and that use is SI_END_CF,
   // we can optimize SI_IF by returning the full saved exec mask instead of
   // just cleared bits.
-  bool SimpleIf = isSimpleIf(MI, MRI, TII);
+  bool SimpleIf = isSimpleIf(MI, MRI);
+
+  if (InsertKillCleanups) {
+    // Check for SI_KILL_*_TERMINATOR on full path of control flow and
+    // flag the associated SI_END_CF for insertion of a kill cleanup.
+    auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
+    while (UseMI->getOpcode() != AMDGPU::SI_END_CF) {
+      assert(std::next(UseMI) == MRI->use_instr_nodbg_end());
+      assert(UseMI->getOpcode() == AMDGPU::SI_ELSE);
+      MachineOperand &NextExec = UseMI->getOperand(0);
+      Register NextExecReg = NextExec.getReg();
+      if (NextExec.isDead()) {
+        assert(!SimpleIf);
+        break;
+      }
+      UseMI = MRI->use_instr_nodbg_begin(NextExecReg);
+    }
+    if (UseMI->getOpcode() == AMDGPU::SI_END_CF) {
+      if (hasKill(MI.getParent(), UseMI->getParent(), TII)) {
+        NeedsKillCleanup.insert(&*UseMI);
+        SimpleIf = false;
+      }
+    }
+  } else if (SimpleIf) {
+    // Check for SI_KILL_*_TERMINATOR on path from if to endif.
+    // If there is any such terminator, simplifications are not safe.
+    auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
+    SimpleIf = !hasKill(MI.getParent(), UseMI->getParent(), TII);
+  }
 
   // Add an implicit def of exec to discourage scheduling VALU after this which
   // will interfere with trying to form s_and_saveexec_b64 later.
@@ -219,6 +249,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
   BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
     .addReg(Exec)
     .addReg(Exec, RegState::ImplicitDefine);
+  LoweredIf.insert(CopyReg);
 
   Register Tmp = MRI->createVirtualRegister(BoolRC);
@@ -282,7 +313,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
 
-  Register DstReg = getSaveExec(&MI);
+  Register DstReg = MI.getOperand(0).getReg();
   bool ExecModified = MI.getOperand(3).getImm() != 0;
   MachineBasicBlock::iterator Start = MBB.begin();
@@ -354,7 +385,7 @@ void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
-  auto Dst = getSaveExec(&MI);
+  auto Dst = MI.getOperand(0).getReg();
 
   // Skip ANDing with exec if the break condition is already masked by exec
   // because it is a V_CMP in the same basic block. (We know the break
@@ -416,6 +447,38 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
+MachineBasicBlock::iterator
+SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const {
+
+  SmallSet<const MachineBasicBlock *, 4> Visited;
+  MachineBasicBlock *B = &MBB;
+  do {
+    if (!Visited.insert(B).second)
+      return MBB.end();
+
+    auto E = B->end();
+    for ( ; It != E; ++It) {
+      if (It->getOpcode() == AMDGPU::SI_KILL_CLEANUP)
+        continue;
+      if (TII->mayReadEXEC(*MRI, *It))
+        break;
+    }
+
+    if (It != E)
+      return It;
+
+    if (B->succ_size() != 1)
+      return MBB.end();
+
+    // If there is one trivial successor, advance to the next block.
+    MachineBasicBlock *Succ = *B->succ_begin();
+
+    It = Succ->begin();
+    B = Succ;
+  } while (true);
+}
+
 void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -430,8 +493,20 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
     .addReg(Exec)
     .add(MI.getOperand(0));
 
-  if (LIS)
+  LoweredEndCf.insert(NewMI);
+
+  // If this ends control flow which contains kills (as flagged in emitIf),
+  // then insert an SI_KILL_CLEANUP immediately following the exec mask
+  // manipulation. This can be lowered to early termination if appropriate.
+  MachineInstr *CleanUpMI = nullptr;
+  if (NeedsKillCleanup.count(&MI))
+    CleanUpMI = BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_KILL_CLEANUP));
+
+  if (LIS) {
     LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
+    if (CleanUpMI)
+      LIS->InsertMachineInstrInMaps(*CleanUpMI);
+  }
 
   MI.eraseFromParent();
@@ -494,6 +569,84 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) {
     MRI->getUniqueVRegDef(Reg)->eraseFromParent();
 }
 
+void SILowerControlFlow::optimizeEndCf() {
+  // If the only instruction immediately following this END_CF is another
+  // END_CF in the only successor, we can avoid emitting exec mask restore here.
+  if (!RemoveRedundantEndcf)
+    return;
+
+  for (MachineInstr *MI : LoweredEndCf) {
+    MachineBasicBlock &MBB = *MI->getParent();
+    auto Next =
+      skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI->getIterator()));
+    if (Next == MBB.end() || !LoweredEndCf.count(&*Next))
+      continue;
+    // Only skip the inner END_CF if the outer END_CF belongs to an SI_IF.
+    // If it belongs to an SI_ELSE, the saved mask has an inverted value.
+    Register SavedExec
+        = TII->getNamedOperand(*Next, AMDGPU::OpName::src1)->getReg();
+    assert(SavedExec.isVirtual() && "Expected saved exec to be src1!");
+
+    const MachineInstr *Def = MRI->getUniqueVRegDef(SavedExec);
+    if (Def && LoweredIf.count(SavedExec)) {
+      LLVM_DEBUG(dbgs() << "Skip redundant "; MI->dump());
+      if (LIS)
+        LIS->RemoveMachineInstrFromMaps(*MI);
+      MI->eraseFromParent();
+    }
+  }
+}
+
+void SILowerControlFlow::process(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineBasicBlock::iterator I(MI);
+  MachineInstr *Prev = (I != MBB.begin()) ? &*(std::prev(I)) : nullptr;
+
+  switch (MI.getOpcode()) {
+  case AMDGPU::SI_IF:
+    emitIf(MI);
+    break;
+
+  case AMDGPU::SI_ELSE:
+    emitElse(MI);
+    break;
+
+  case AMDGPU::SI_IF_BREAK:
+    emitIfBreak(MI);
+    break;
+
+  case AMDGPU::SI_LOOP:
+    emitLoop(MI);
+    break;
+
+  case AMDGPU::SI_END_CF:
+    emitEndCf(MI);
+    break;
+
+  default:
+    assert(false && "Attempt to process unsupported instruction");
+    break;
+  }
+
+  MachineBasicBlock::iterator Next;
+  for (I = Prev ? Prev->getIterator() : MBB.begin(); I != MBB.end(); I = Next) {
+    Next = std::next(I);
+    MachineInstr &MaskMI = *I;
+    switch (MaskMI.getOpcode()) {
+    case AMDGPU::S_AND_B64:
+    case AMDGPU::S_OR_B64:
+    case AMDGPU::S_AND_B32:
+    case AMDGPU::S_OR_B32:
+      // Cleanup bit manipulations on exec mask
+      combineMasks(MaskMI);
+      break;
+    default:
+      I = MBB.end();
+      break;
+    }
+  }
+}
+
 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
@@ -503,6 +656,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   LIS = getAnalysisIfAvailable<LiveIntervals>();
   MRI = &MF.getRegInfo();
   BoolRC = TRI->getBoolRC();
+  InsertKillCleanups =
+      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
 
   if (ST.isWave32()) {
     AndOpc = AMDGPU::S_AND_B32;
@@ -524,57 +679,49 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
     Exec = AMDGPU::EXEC;
   }
 
+  SmallVector<MachineInstr *, 32> Worklist;
+
   MachineFunction::iterator NextBB;
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
        BI != BE; BI = NextBB) {
     NextBB = std::next(BI);
     MachineBasicBlock &MBB = *BI;
 
-    MachineBasicBlock::iterator I, Next, Last;
-
-    for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
       MachineInstr &MI = *I;
 
       switch (MI.getOpcode()) {
       case AMDGPU::SI_IF:
-        emitIf(MI);
+        process(MI);
        break;
 
       case AMDGPU::SI_ELSE:
-        emitElse(MI);
-        break;
-
       case AMDGPU::SI_IF_BREAK:
-        emitIfBreak(MI);
-        break;
-
       case AMDGPU::SI_LOOP:
-        emitLoop(MI);
-        break;
-
       case AMDGPU::SI_END_CF:
-        emitEndCf(MI);
+        // Only build worklist if SI_IF instructions must be processed first.
+        if (InsertKillCleanups)
+          Worklist.push_back(&MI);
+        else
+          process(MI);
         break;
 
-      case AMDGPU::S_AND_B64:
-      case AMDGPU::S_OR_B64:
-      case AMDGPU::S_AND_B32:
-      case AMDGPU::S_OR_B32:
-        // Cleanup bit manipulations on exec mask
-        combineMasks(MI);
-        Last = I;
-        continue;
-
       default:
-        Last = I;
-        continue;
+        break;
      }
-
-      // Replay newly inserted code to combine masks
-      Next = (Last == MBB.end()) ? MBB.begin() : Last;
    }
  }
 
+  for (MachineInstr *MI : Worklist)
+    process(*MI);
+
+  optimizeEndCf();
+
+  LoweredEndCf.clear();
+  LoweredIf.clear();
+  NeedsKillCleanup.clear();
+
   return true;
 }
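For readers following the patch, below is a compilable, standalone sketch of the successor walk performed by the new hasKill() helper. It is not the LLVM code itself: the Block type, its Succs vector, and the HasKillTerminator flag are simplified stand-ins for MachineBasicBlock, succ_begin()/succ_end(), and TII->isKillTerminator(); only the worklist traversal mirrors the patch.

    #include <cassert>
    #include <unordered_set>
    #include <vector>

    struct Block {
      std::vector<Block *> Succs;     // successor blocks in the CFG
      bool HasKillTerminator = false; // stand-in for TII->isKillTerminator(...)
    };

    // Walk every block reachable from Begin's successors, stopping at End,
    // and report whether any of them ends in a kill terminator. Same
    // worklist traversal as the patch's hasKill(), minus the LLVM types.
    static bool hasKill(const Block *Begin, const Block *End) {
      std::unordered_set<const Block *> Visited;
      std::vector<Block *> Worklist(Begin->Succs.begin(), Begin->Succs.end());

      while (!Worklist.empty()) {
        Block *B = Worklist.back();
        Worklist.pop_back();

        // Do not walk past the endif block, and visit each block only once.
        if (B == End || !Visited.insert(B).second)
          continue;
        if (B->HasKillTerminator)
          return true;

        Worklist.insert(Worklist.end(), B->Succs.begin(), B->Succs.end());
      }
      return false;
    }

    int main() {
      Block IfBB, ThenBB, EndBB;
      IfBB.Succs = {&ThenBB, &EndBB}; // simple if: then-block plus fallthrough
      ThenBB.Succs = {&EndBB};

      assert(!hasKill(&IfBB, &EndBB)); // no kill on any path: SimpleIf can stand
      ThenBB.HasKillTerminator = true;
      assert(hasKill(&IfBB, &EndBB));  // kill inside the then-block
      return 0;
    }

Note that the function returns true on a kill, where the old isSimpleIf() returned false; the patch inverts the predicate so it can serve both the SimpleIf check and the new NeedsKillCleanup flagging.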
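Similarly, here is a minimal sketch of the two-phase lowering order that runOnMachineFunction() gains: when kill cleanups may be inserted (InsertKillCleanups, i.e. AMDGPU_PS functions), SI_IF pseudos are processed immediately so emitIf() can flag the matching SI_END_CF, while the remaining control-flow pseudos are deferred to a second pass. The Instr and Opcode types and the process() body are hypothetical placeholders, not the LLVM API; only the ordering logic reflects the patch.

    #include <iostream>
    #include <string>
    #include <vector>

    enum class Opcode { SiIf, SiElse, SiIfBreak, SiLoop, SiEndCf };

    struct Instr {
      Opcode Op;
      std::string Name;
    };

    // Stand-in for SILowerControlFlow::process(): just records the order.
    static void process(Instr &MI) { std::cout << "lower " << MI.Name << '\n'; }

    static void lowerAll(std::vector<Instr> &Prog, bool InsertKillCleanups) {
      std::vector<Instr *> Worklist;
      for (Instr &MI : Prog) {
        if (MI.Op == Opcode::SiIf)
          process(MI);             // SI_IF is always lowered in the first pass
        else if (InsertKillCleanups)
          Worklist.push_back(&MI); // defer the rest until every SI_IF is done
        else
          process(MI);             // original single-pass behaviour
      }
      for (Instr *MI : Worklist)   // second pass over the deferred pseudos
        process(*MI);
    }

    int main() {
      std::vector<Instr> Prog = {{Opcode::SiIf, "SI_IF %0"},
                                 {Opcode::SiEndCf, "SI_END_CF %0"},
                                 {Opcode::SiIf, "SI_IF %1"},
                                 {Opcode::SiEndCf, "SI_END_CF %1"}};
      // Prints both SI_IFs before either SI_END_CF.
      lowerAll(Prog, /*InsertKillCleanups=*/true);
      return 0;
    }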