Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp')
-rw-r--r-- contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp | 538
1 file changed, 538 insertions, 0 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
new file mode 100644
index 000000000000..87e63fcc4a04
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -0,0 +1,538 @@
+//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass inserts branches on the 0 exec mask over divergent branches
+/// when it's expected that jumping over the untaken control flow will
+/// be cheaper than having every workitem no-op through it.
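+///
+/// For illustration only (simplified MIR, not taken from this change): a
+/// divergent branch typically reaches this pass as
+///
+///   SI_MASK_BRANCH %bb.2
+///
+/// and when shouldSkip() decides the region up to %bb.2 is too expensive (or
+/// unsafe) to run with EXEC = 0, the pass inserts a hardware skip after it:
+///
+///   SI_MASK_BRANCH %bb.2
+///   S_CBRANCH_EXECZ %bb.2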
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-insert-skips"
+
+static cl::opt<unsigned> SkipThresholdFlag(
+ "amdgpu-skip-threshold",
+ cl::desc("Number of instructions before jumping over divergent control flow"),
+ cl::init(12), cl::Hidden);
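+// (Illustrative use only: being a cl::opt, the threshold can be overridden on
+// the llc command line, e.g. "llc -mtriple=amdgcn -amdgpu-skip-threshold=4".
+// Lower values insert skip branches more aggressively.)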
+
+namespace {
+
+class SIInsertSkips : public MachineFunctionPass {
+private:
+ const SIRegisterInfo *TRI = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ unsigned SkipThreshold = 0;
+
+ bool shouldSkip(const MachineBasicBlock &From,
+ const MachineBasicBlock &To) const;
+
+ bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
+
+ void kill(MachineInstr &MI);
+
+ MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
+
+ bool optimizeVccBranch(MachineInstr &MI) const;
+
+public:
+ static char ID;
+
+ SIInsertSkips() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "SI insert s_cbranch_execz instructions";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char SIInsertSkips::ID = 0;
+
+INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
+ "SI insert s_cbranch_execz instructions", false, false)
+
+char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
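+// The exported pass ID lets code outside this file (the target's pass
+// configuration in AMDGPUTargetMachine.cpp) create and schedule this pass
+// late in the codegen pipeline.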
+
+static bool opcodeEmitsNoInsts(const MachineInstr &MI) {
+ if (MI.isMetaInstruction())
+ return true;
+
+ // Handle target specific opcodes.
+ switch (MI.getOpcode()) {
+ case AMDGPU::SI_MASK_BRANCH:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
+ const MachineBasicBlock &To) const {
+ unsigned NumInstr = 0;
+ const MachineFunction *MF = From.getParent();
+
+ for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
+ MBBI != End && MBBI != ToI; ++MBBI) {
+ const MachineBasicBlock &MBB = *MBBI;
+
+ for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
+ NumInstr < SkipThreshold && I != E; ++I) {
+ if (opcodeEmitsNoInsts(*I))
+ continue;
+
+ // FIXME: Since this is required for correctness, this should be inserted
+ // during SILowerControlFlow.
+
+ // When a uniform loop is inside non-uniform control flow, the branch
+ // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
+ // when EXEC = 0. We should skip the loop lest it become infinite.
+ if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
+ I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
+ return true;
+
+ if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
+ return true;
+
+ // These instructions are potentially expensive even if EXEC = 0.
+ if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
+ I->getOpcode() == AMDGPU::S_WAITCNT)
+ return true;
+
+ ++NumInstr;
+ if (NumInstr >= SkipThreshold)
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction *MF = MBB.getParent();
+
+ if (MF->getFunction().getCallingConv() != CallingConv::AMDGPU_PS ||
+ !shouldSkip(MBB, MBB.getParent()->back()))
+ return false;
+
+ MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
+
+ const DebugLoc &DL = MI.getDebugLoc();
+
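+ // The code built below is roughly (illustrative wave64 assembly):
+ //   s_cbranch_execnz <NextBB>           ; at the end of MBB
+ //   exp null off, off, off, off done vm ; in SkipBB
+ //   s_endpgm                            ; in SkipBB
+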
+ // If the exec mask is non-zero, skip the next two instructions
+ BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ .addMBB(&NextBB);
+
+ MachineBasicBlock::iterator Insert = SkipBB->begin();
+
+ // Exec mask is zero: Export to NULL target...
+ BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
+ .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addImm(1) // vm
+ .addImm(0) // compr
+ .addImm(0); // en
+
+ // ... and terminate wavefront.
+ BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
+
+ return true;
+}
+
+void SIInsertSkips::kill(MachineInstr &MI) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+
+ switch (MI.getOpcode()) {
+ case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
+ unsigned Opcode = 0;
+
+ // The opcodes are inverted because the inline immediate has to be
+ // the first operand, e.g. from "x < imm" to "imm > x"
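+ // For example, a kill on "x < 1.0" (SETOLT) becomes a V_CMPX_GT_F32
+ // comparing 1.0 against x.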
+ switch (MI.getOperand(2).getImm()) {
+ case ISD::SETOEQ:
+ case ISD::SETEQ:
+ Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
+ break;
+ case ISD::SETOGT:
+ case ISD::SETGT:
+ Opcode = AMDGPU::V_CMPX_LT_F32_e64;
+ break;
+ case ISD::SETOGE:
+ case ISD::SETGE:
+ Opcode = AMDGPU::V_CMPX_LE_F32_e64;
+ break;
+ case ISD::SETOLT:
+ case ISD::SETLT:
+ Opcode = AMDGPU::V_CMPX_GT_F32_e64;
+ break;
+ case ISD::SETOLE:
+ case ISD::SETLE:
+ Opcode = AMDGPU::V_CMPX_GE_F32_e64;
+ break;
+ case ISD::SETONE:
+ case ISD::SETNE:
+ Opcode = AMDGPU::V_CMPX_LG_F32_e64;
+ break;
+ case ISD::SETO:
+ Opcode = AMDGPU::V_CMPX_O_F32_e64;
+ break;
+ case ISD::SETUO:
+ Opcode = AMDGPU::V_CMPX_U_F32_e64;
+ break;
+ case ISD::SETUEQ:
+ Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
+ break;
+ case ISD::SETUGT:
+ Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
+ break;
+ case ISD::SETUGE:
+ Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
+ break;
+ case ISD::SETULT:
+ Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
+ break;
+ case ISD::SETULE:
+ Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
+ break;
+ case ISD::SETUNE:
+ Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
+ break;
+ default:
+ llvm_unreachable("invalid ISD:SET cond code");
+ }
+
+ const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
+ if (ST.hasNoSdstCMPX())
+ Opcode = AMDGPU::getVCMPXNoSDstOp(Opcode);
+
+ assert(MI.getOperand(0).isReg());
+
+ if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
+ MI.getOperand(0).getReg())) {
+ Opcode = AMDGPU::getVOPe32(Opcode);
+ BuildMI(MBB, &MI, DL, TII->get(Opcode))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(0));
+ } else {
+ auto I = BuildMI(MBB, &MI, DL, TII->get(Opcode));
+ if (!ST.hasNoSdstCMPX())
+ I.addReg(AMDGPU::VCC, RegState::Define);
+
+ I.addImm(0) // src0 modifiers
+ .add(MI.getOperand(1))
+ .addImm(0) // src1 modifiers
+ .add(MI.getOperand(0));
+
+ I.addImm(0); // omod
+ }
+ break;
+ }
+ case AMDGPU::SI_KILL_I1_TERMINATOR: {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ const MachineOperand &Op = MI.getOperand(0);
+ int64_t KillVal = MI.getOperand(1).getImm();
+ assert(KillVal == 0 || KillVal == -1);
+
+ // Kill all threads if Op0 is an immediate and equal to the Kill value.
+ if (Op.isImm()) {
+ int64_t Imm = Op.getImm();
+ assert(Imm == 0 || Imm == -1);
+
+ if (Imm == KillVal)
+ BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32
+ : AMDGPU::S_MOV_B64), Exec)
+ .addImm(0);
+ break;
+ }
+
+ unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
+ if (ST.isWave32())
+ Opcode = KillVal ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_AND_B32;
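+ // I.e. KillVal == -1 (kill where Op is true): EXEC &= ~Op;
+ //      KillVal == 0 (kill where Op is false): EXEC &= Op.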
+ BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec)
+ .addReg(Exec)
+ .add(Op);
+ break;
+ }
+ default:
+ llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
+ }
+}
+
+MachineBasicBlock *SIInsertSkips::insertSkipBlock(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
+ MachineFunction *MF = MBB.getParent();
+
+ MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI;
+
+ MF->insert(MBBI, SkipBB);
+ MBB.addSuccessor(SkipBB);
+
+ return SkipBB;
+}
+
+// Returns true if a branch over the block was inserted.
+bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
+ MachineBasicBlock &SrcMBB) {
+ MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();
+
+ if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
+ return false;
+
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());
+
+ BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
+ .addMBB(DestBB);
+
+ return true;
+}
+
+bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const {
+ // Match:
+ // sreg = -1
+ // vcc = S_AND_B64 exec, sreg
+ // S_CBRANCH_VCC[N]Z
+ // =>
+ // S_CBRANCH_EXEC[N]Z
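+ //
+ // If sreg turns out to be exec itself, the condition is just a test of exec:
+ // S_CBRANCH_VCCNZ becomes an unconditional S_BRANCH and S_CBRANCH_VCCZ is
+ // deleted.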
+ bool Changed = false;
+ MachineBasicBlock &MBB = *MI.getParent();
+ const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
+ const bool IsWave32 = ST.isWave32();
+ const unsigned CondReg = TRI->getVCC();
+ const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+
+ MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
+ E = MBB.rend();
+ bool ReadsCond = false;
+ unsigned Threshold = 5;
+ for (++A ; A != E ; ++A) {
+ if (!--Threshold)
+ return false;
+ if (A->modifiesRegister(ExecReg, TRI))
+ return false;
+ if (A->modifiesRegister(CondReg, TRI)) {
+ if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And)
+ return false;
+ break;
+ }
+ ReadsCond |= A->readsRegister(CondReg, TRI);
+ }
+ if (A == E)
+ return false;
+
+ MachineOperand &Op1 = A->getOperand(1);
+ MachineOperand &Op2 = A->getOperand(2);
+ if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
+ TII->commuteInstruction(*A);
+ Changed = true;
+ }
+ if (Op1.getReg() != ExecReg)
+ return Changed;
+ if (Op2.isImm() && Op2.getImm() != -1)
+ return Changed;
+
+ unsigned SReg = AMDGPU::NoRegister;
+ if (Op2.isReg()) {
+ SReg = Op2.getReg();
+ auto M = std::next(A);
+ bool ReadsSreg = false;
+ for ( ; M != E ; ++M) {
+ if (M->definesRegister(SReg, TRI))
+ break;
+ if (M->modifiesRegister(SReg, TRI))
+ return Changed;
+ ReadsSreg |= M->readsRegister(SReg, TRI);
+ }
+ if (M == E ||
+ !M->isMoveImmediate() ||
+ !M->getOperand(1).isImm() ||
+ M->getOperand(1).getImm() != -1)
+ return Changed;
+ // If SReg is only used by this AND and is killed here, fold the -1
+ // immediate into the AND and delete the move that defined SReg.
+ if (!ReadsSreg && Op2.isKill()) {
+ A->getOperand(2).ChangeToImmediate(-1);
+ M->eraseFromParent();
+ }
+ }
+
+ if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
+ MI.killsRegister(CondReg, TRI))
+ A->eraseFromParent();
+
+ bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
+ if (SReg == ExecReg) {
+ if (IsVCCZ) {
+ MI.eraseFromParent();
+ return true;
+ }
+ MI.setDesc(TII->get(AMDGPU::S_BRANCH));
+ } else {
+ MI.setDesc(TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ
+ : AMDGPU::S_CBRANCH_EXECNZ));
+ }
+
+ MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
+ MI.addImplicitDefUseOperands(*MBB.getParent());
+
+ return true;
+}
+
+bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+ SkipThreshold = SkipThresholdFlag;
+
+ bool HaveKill = false;
+ bool MadeChange = false;
+
+ // Track depth of exec mask, divergent branches.
+ SmallVector<MachineBasicBlock *, 16> ExecBranchStack;
+
+ MachineFunction::iterator NextBB;
+
+ MachineBasicBlock *EmptyMBBAtEnd = nullptr;
+
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+ BI != BE; BI = NextBB) {
+ NextBB = std::next(BI);
+ MachineBasicBlock &MBB = *BI;
+ bool HaveSkipBlock = false;
+
+ if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
+ // Reached convergence point for last divergent branch.
+ ExecBranchStack.pop_back();
+ }
+
+ if (HaveKill && ExecBranchStack.empty()) {
+ HaveKill = false;
+
+ // TODO: Insert skip if exec is 0?
+ }
+
+ MachineBasicBlock::iterator I, Next;
+ for (I = MBB.begin(); I != MBB.end(); I = Next) {
+ Next = std::next(I);
+
+ MachineInstr &MI = *I;
+
+ switch (MI.getOpcode()) {
+ case AMDGPU::SI_MASK_BRANCH:
+ ExecBranchStack.push_back(MI.getOperand(0).getMBB());
+ MadeChange |= skipMaskBranch(MI, MBB);
+ break;
+
+ case AMDGPU::S_BRANCH:
+ // Optimize out branches to the next block.
+ // FIXME: Shouldn't this be handled by BranchFolding?
+ if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
+ MI.eraseFromParent();
+ } else if (HaveSkipBlock) {
+ // Remove the unconditional branch when a skip block has been inserted
+ // after the current block, so the S_CBRANCH_EXECNZ added by skipIfDead
+ // can jump over the two instructions performing the kill when the exec
+ // mask is non-zero.
+ MI.eraseFromParent();
+ }
+ break;
+
+ case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
+ case AMDGPU::SI_KILL_I1_TERMINATOR:
+ MadeChange = true;
+ kill(MI);
+
+ if (ExecBranchStack.empty()) {
+ if (NextBB != BE && skipIfDead(MI, *NextBB)) {
+ HaveSkipBlock = true;
+ NextBB = std::next(BI);
+ BE = MF.end();
+ }
+ } else {
+ HaveKill = true;
+ }
+
+ MI.eraseFromParent();
+ break;
+
+ case AMDGPU::SI_RETURN_TO_EPILOG:
+ // FIXME: Should move somewhere else
+ assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
+
+ // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
+ // because external bytecode will be appended at the end.
+ if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
+ // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at
+ // the end and jump there.
+ if (!EmptyMBBAtEnd) {
+ EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
+ MF.insert(MF.end(), EmptyMBBAtEnd);
+ }
+
+ MBB.addSuccessor(EmptyMBBAtEnd);
+ BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
+ .addMBB(EmptyMBBAtEnd);
+ I->eraseFromParent();
+ }
+ break;
+
+ case AMDGPU::S_CBRANCH_VCCZ:
+ case AMDGPU::S_CBRANCH_VCCNZ:
+ MadeChange |= optimizeVccBranch(MI);
+ break;
+
+ default:
+ break;
+ }
+ }
+ }
+
+ return MadeChange;
+}