Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.h                          |   3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp           |   2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp |  79
-rw-r--r--  llvm/lib/Target/AMDGPU/CaymanInstructions.td             |   4
-rw-r--r--  llvm/lib/Target/AMDGPU/EvergreenInstructions.td          |   3
-rw-r--r--  llvm/lib/Target/AMDGPU/R600Instructions.td               |   7
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInsertSkips.cpp                 |   5
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp            |  10
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp     | 158
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp          |   2
10 files changed, 90 insertions(+), 183 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index fbed51de0ea49..a55a1747cafe6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -156,9 +156,6 @@ extern char &SIWholeQuadModeID;
void initializeSILowerControlFlowPass(PassRegistry &);
extern char &SILowerControlFlowID;
-void initializeSIRemoveShortExecBranchesPass(PassRegistry &);
-extern char &SIRemoveShortExecBranchesID;
-
void initializeSIInsertSkipsPass(PassRegistry &);
extern char &SIInsertSkipsPassID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index eb30d659bf0b5..c8dc6f6e3bf4c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -228,7 +228,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSIModeRegisterPass(*PR);
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
- initializeSIRemoveShortExecBranchesPass(*PR);
initializeSIInsertSkipsPass(*PR);
initializeSIMemoryLegalizerPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);
@@ -994,7 +993,6 @@ void GCNPassConfig::addPreEmitPass() {
// be better for it to emit S_NOP <N> when possible.
addPass(&PostRAHazardRecognizerID);
- addPass(&SIRemoveShortExecBranchesID);
addPass(&SIInsertSkipsPassID);
addPass(&BranchRelaxationPassID);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 191f603a66d6a..01bb60f07f2ec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -34,6 +34,7 @@
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
@@ -117,24 +118,58 @@ static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
return true;
}
+static void removeDoneExport(Function &F) {
+ ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
+ if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
+ Intrin->setArgOperand(6, BoolFalse); // done
+ } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
+ Intrin->setArgOperand(4, BoolFalse); // done
+ }
+ }
+ }
+ }
+}
+
static BasicBlock *unifyReturnBlockSet(Function &F,
ArrayRef<BasicBlock *> ReturningBlocks,
+ bool InsertExport,
const TargetTransformInfo &TTI,
StringRef Name) {
// Otherwise, we need to insert a new basic block into the function, add PHI
// nodes (if the function returns values), and convert all of the return
// instructions into unconditional branches.
BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
+ IRBuilder<> B(NewRetBlock);
+
+ if (InsertExport) {
+ // Ensure that there's only one "done" export in the shader by removing the
+ // "done" bit set on the original final export. More than one "done" export
+ // can lead to undefined behavior.
+ removeDoneExport(F);
+
+ Value *Undef = UndefValue::get(B.getFloatTy());
+ B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
+ {
+ B.getInt32(9), // target, SQ_EXP_NULL
+ B.getInt32(0), // enabled channels
+ Undef, Undef, Undef, Undef, // values
+ B.getTrue(), // done
+ B.getTrue(), // valid mask
+ });
+ }
PHINode *PN = nullptr;
if (F.getReturnType()->isVoidTy()) {
- ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
+ B.CreateRetVoid();
} else {
// If the function doesn't return void... add a PHI node to the block...
- PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
- "UnifiedRetVal");
- NewRetBlock->getInstList().push_back(PN);
- ReturnInst::Create(F.getContext(), PN, NewRetBlock);
+ PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
+ "UnifiedRetVal");
+ assert(!InsertExport);
+ B.CreateRet(PN);
}
// Loop over all of the blocks, replacing the return instruction with an
@@ -173,6 +208,8 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
// Dummy return block for infinite loop.
BasicBlock *DummyReturnBB = nullptr;
+ bool InsertExport = false;
+
for (BasicBlock *BB : PDT.getRoots()) {
if (isa<ReturnInst>(BB->getTerminator())) {
if (!isUniformlyReached(DA, *BB))
@@ -188,6 +225,36 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
"DummyReturnBlock", &F);
Type *RetTy = F.getReturnType();
Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
+
+ // For pixel shaders, the producer guarantees that an export is
+ // executed before each return instruction. However, if there is an
+ // infinite loop and we insert a return ourselves, we need to uphold
+ // that guarantee by inserting a null export. This can happen e.g. in
+ // an infinite loop with kill instructions, which is supposed to
+ // terminate. However, we don't need to do this if there is a non-void
+ // return value, since then there is an epilog afterwards which will
+ // still export.
+ //
+ // Note: In the case where only some threads enter the infinite loop,
+ // this can result in the null export happening redundantly after the
+ // original exports. However, the last "real" export happens after all
+ // the threads that didn't enter an infinite loop converged, which
+ // means that the only extra threads to execute the null export are
+ // threads that entered the infinite loop, and they could only have
+ // exited by being killed, which sets their exec bit to 0.
+ // Therefore, unless there's an actual infinite loop, which can have
+ // invalid results, or there's a kill after the last export, which we
+ // assume the frontend won't do, this export will have the same exec
+ // mask as the last "real" export, and therefore the valid mask will be
+ // overwritten with the same value and will still be correct. Also,
+ // even though this forces an extra unnecessary export wait, we assume
+ // that this happens rarely enough in practice that we don't have to
+ // worry about performance.
+ if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
+ RetTy->isVoidTy()) {
+ InsertExport = true;
+ }
+
ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
ReturningBlocks.push_back(DummyReturnBB);
}
@@ -260,6 +327,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
const TargetTransformInfo &TTI
= getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock");
+ unifyReturnBlockSet(F, ReturningBlocks, InsertExport, TTI, "UnifiedReturnBlock");
return true;
}
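For context on the operand indices used by removeDoneExport above: per the AMDGPU intrinsic definitions, llvm.amdgcn.exp takes (tgt, en, src0, src1, src2, src3, done, vm), so the "done" flag is argument 6, while the compressed form llvm.amdgcn.exp.compr takes (tgt, en, src0, src1, done, vm), so it is argument 4. Below is a minimal sketch of the null export the unified return block now receives for void AMDGPU_PS functions; the helper name and standalone framing are illustrative, not part of this patch. Target 9 is SQ_EXP_NULL, no channels are enabled, and both the done and valid-mask bits are set.

  // Illustrative only; mirrors the IRBuilder call added in unifyReturnBlockSet.
  // Depending on the LLVM revision, Intrinsic::amdgcn_exp may also require
  // llvm/IR/IntrinsicsAMDGPU.h.
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Intrinsics.h"

  using namespace llvm;

  static void insertNullExport(IRBuilder<> &B) {
    Value *Undef = UndefValue::get(B.getFloatTy());
    B.CreateIntrinsic(Intrinsic::amdgcn_exp, {B.getFloatTy()},
                      {B.getInt32(9),              // target: SQ_EXP_NULL
                       B.getInt32(0),              // enabled channels: none
                       Undef, Undef, Undef, Undef, // export values (ignored)
                       B.getTrue(),                // done: final export
                       B.getTrue()});              // vm: write the valid mask
  }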
diff --git a/llvm/lib/Target/AMDGPU/CaymanInstructions.td b/llvm/lib/Target/AMDGPU/CaymanInstructions.td
index 1a526675164a0..e2978624811d1 100644
--- a/llvm/lib/Target/AMDGPU/CaymanInstructions.td
+++ b/llvm/lib/Target/AMDGPU/CaymanInstructions.td
@@ -50,6 +50,8 @@ def COS_cm : COS_Common<0x8E>;
def : RsqPat<RECIPSQRT_IEEE_cm, f32>;
+def : SqrtPat<RECIPSQRT_IEEE_cm, RECIP_IEEE_cm>;
+
def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
@@ -70,8 +72,6 @@ def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> {
-def : R600Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;
-
class RAT_STORE_DWORD <RegisterClass rc, ValueType vt, bits<4> mask> :
CF_MEM_RAT_CACHELESS <0x14, 0, mask,
(ins rc:$rw_gpr, R600_TReg32_X:$index_gpr),
diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
index 792e26d21f98d..88e554ae0bcc2 100644
--- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -118,11 +118,12 @@ def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
def : RsqPat<RECIPSQRT_IEEE_eg, f32>;
+def : SqrtPat<RECIPSQRT_IEEE_eg, RECIP_IEEE_eg>;
+
def SIN_eg : SIN_Common<0x8D>;
def COS_eg : COS_Common<0x8E>;
def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>;
-def : EGPat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>;
} // End SubtargetPredicate = isEG
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/R600Instructions.td b/llvm/lib/Target/AMDGPU/R600Instructions.td
index cbdf0de44f873..869c183e2245b 100644
--- a/llvm/lib/Target/AMDGPU/R600Instructions.td
+++ b/llvm/lib/Target/AMDGPU/R600Instructions.td
@@ -1233,6 +1233,11 @@ def : R600Pat<
def : RcpPat<recip_ieee, f32>;
}
+class SqrtPat<Instruction RsqInst, Instruction RecipInst> : R600Pat <
+ (fsqrt f32:$src),
+ (RecipInst (RsqInst $src))
+>;
+
//===----------------------------------------------------------------------===//
// R600 / R700 Instructions
//===----------------------------------------------------------------------===//
@@ -1272,8 +1277,8 @@ let Predicates = [isR600] in {
defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>;
- def : R600Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
def : RsqPat<RECIPSQRT_IEEE_r600, f32>;
+ def : SqrtPat<RECIPSQRT_IEEE_r600, RECIP_IEEE_r600>;
def R600_ExportSwz : ExportSwzInst {
let Word1{20-17} = 0; // BURST_COUNT
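The SqrtPat class above replaces the per-target fsqrt patterns removed from the R600, Evergreen, and Cayman files: instead of expanding sqrt(x) as x * RECIPSQRT_CLAMPED(x), fsqrt is now expanded as RECIP_IEEE(RECIPSQRT_IEEE(x)). Both expansions follow from the identity sqrt(x) = x * x^(-1/2) = (x^(-1/2))^(-1). A small host-side sketch of the equivalence, using exact library math rather than the hardware approximation instructions:

  #include <cmath>
  #include <cstdio>

  int main() {
    float x = 2.0f;
    float rsq = 1.0f / std::sqrt(x);              // stand-in for RECIPSQRT_IEEE
    std::printf("sqrt(x)   = %f\n", std::sqrt(x));
    std::printf("x * rsq   = %f\n", x * rsq);     // old pattern
    std::printf("1 / rsq   = %f\n", 1.0f / rsq);  // new SqrtPat expansion
    return 0;
  }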
diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
index 80c044ec00cb3..87e63fcc4a04f 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -41,7 +41,7 @@ using namespace llvm;
#define DEBUG_TYPE "si-insert-skips"
static cl::opt<unsigned> SkipThresholdFlag(
- "amdgpu-skip-threshold-legacy",
+ "amdgpu-skip-threshold",
cl::desc("Number of instructions before jumping over divergent control flow"),
cl::init(12), cl::Hidden);
@@ -466,9 +466,6 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
MachineInstr &MI = *I;
switch (MI.getOpcode()) {
- case AMDGPU::S_CBRANCH_EXECZ:
- ExecBranchStack.push_back(MI.getOperand(0).getMBB());
- break;
case AMDGPU::SI_MASK_BRANCH:
ExecBranchStack.push_back(MI.getOperand(0).getMBB());
MadeChange |= skipMaskBranch(MI, MBB);
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 61d2719a3aad6..bf052dc3c9304 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -244,9 +244,9 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec)
.addReg(Tmp, RegState::Kill);
- // Insert the S_CBRANCH_EXECZ instruction which will be optimized later
- // during SIRemoveShortExecBranches.
- MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
+ // Insert a pseudo terminator to help keep the verifier happy. This will also
+ // be used later when inserting skips.
+ MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
.add(MI.getOperand(2));
if (!LIS) {
@@ -323,8 +323,8 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
.addReg(DstReg);
MachineInstr *Branch =
- BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
- .addMBB(DestBB);
+ BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+ .addMBB(DestBB);
if (!LIS) {
MI.eraseFromParent();
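Taken together with the SIInsertSkips change above, this restores the earlier skip pipeline: SILowerControlFlow emits the SI_MASK_BRANCH pseudo as a terminator, the separate SIRemoveShortExecBranches pass (deleted below) goes away along with its "amdgpu-skip-threshold" option, and SIInsertSkips again owns that flag and decides per branch whether to materialize an S_CBRANCH_EXECZ skip. A rough sketch of that decision; the name and signature are illustrative, while the in-tree logic lives in skipMaskBranch and its helpers:

  // Illustrative only: the gist of the per-branch decision SIInsertSkips makes
  // for each SI_MASK_BRANCH. The real pass walks the blocks being jumped over
  // and applies -amdgpu-skip-threshold (default 12).
  bool wantsSkipBranch(unsigned NumInstrsSkipped, bool UnsafeWhenExecIsZero,
                       unsigned SkipThreshold = 12) {
    // Materialize S_CBRANCH_EXECZ only when falling through with EXEC == 0
    // would be unsafe, or when the skipped region is long enough to pay off.
    return UnsafeWhenExecIsZero || NumInstrsSkipped >= SkipThreshold;
  }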
diff --git a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
deleted file mode 100644
index 51779e97ac620..0000000000000
--- a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-//===-- SIRemoveShortExecBranches.cpp ------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This pass optimizes the s_cbranch_execz instructions.
-/// The pass removes this skip instruction for short branches,
-/// if there is no unwanted side effect in the fallthrough code sequence.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIInstrInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/Support/CommandLine.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "si-remove-short-exec-branches"
-
-static unsigned SkipThreshold;
-
-static cl::opt<unsigned, true> SkipThresholdFlag(
- "amdgpu-skip-threshold", cl::Hidden,
- cl::desc(
- "Number of instructions before jumping over divergent control flow"),
- cl::location(SkipThreshold), cl::init(12));
-
-namespace {
-
-class SIRemoveShortExecBranches : public MachineFunctionPass {
-private:
- const SIInstrInfo *TII = nullptr;
- bool getBlockDestinations(MachineBasicBlock &SrcMBB,
- MachineBasicBlock *&TrueMBB,
- MachineBasicBlock *&FalseMBB,
- SmallVectorImpl<MachineOperand> &Cond);
- bool mustRetainExeczBranch(const MachineBasicBlock &From,
- const MachineBasicBlock &To) const;
- bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
-
-public:
- static char ID;
-
- SIRemoveShortExecBranches() : MachineFunctionPass(ID) {
- initializeSIRemoveShortExecBranchesPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS(SIRemoveShortExecBranches, DEBUG_TYPE,
- "SI remove short exec branches", false, false)
-
-char SIRemoveShortExecBranches::ID = 0;
-
-char &llvm::SIRemoveShortExecBranchesID = SIRemoveShortExecBranches::ID;
-
-bool SIRemoveShortExecBranches::getBlockDestinations(
- MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
- MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
- if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
- return false;
-
- if (!FalseMBB)
- FalseMBB = SrcMBB.getNextNode();
-
- return true;
-}
-
-bool SIRemoveShortExecBranches::mustRetainExeczBranch(
- const MachineBasicBlock &From, const MachineBasicBlock &To) const {
- unsigned NumInstr = 0;
- const MachineFunction *MF = From.getParent();
-
- for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
- MBBI != End && MBBI != ToI; ++MBBI) {
- const MachineBasicBlock &MBB = *MBBI;
-
- for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
- I != E; ++I) {
- // When a uniform loop is inside non-uniform control flow, the branch
- // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
- // when EXEC = 0. We should skip the loop lest it becomes infinite.
- if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
- I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
- return true;
-
- if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
- return true;
-
- // These instructions are potentially expensive even if EXEC = 0.
- if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
- I->getOpcode() == AMDGPU::S_WAITCNT)
- return true;
-
- ++NumInstr;
- if (NumInstr >= SkipThreshold)
- return true;
- }
- }
-
- return false;
-}
-
-// Returns true if the skip branch instruction is removed.
-bool SIRemoveShortExecBranches::removeExeczBranch(MachineInstr &MI,
- MachineBasicBlock &SrcMBB) {
- MachineBasicBlock *TrueMBB = nullptr;
- MachineBasicBlock *FalseMBB = nullptr;
- SmallVector<MachineOperand, 1> Cond;
-
- if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
- return false;
-
- // Consider only the forward branches.
- if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
- mustRetainExeczBranch(*FalseMBB, *TrueMBB))
- return false;
-
- LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
- MI.eraseFromParent();
- SrcMBB.removeSuccessor(TrueMBB);
-
- return true;
-}
-
-bool SIRemoveShortExecBranches::runOnMachineFunction(MachineFunction &MF) {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- TII = ST.getInstrInfo();
- MF.RenumberBlocks();
- bool Changed = false;
-
- for (MachineBasicBlock &MBB : MF) {
- MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
- if (MBBI == MBB.end())
- continue;
-
- MachineInstr &MI = *MBBI;
- switch (MI.getOpcode()) {
- case AMDGPU::S_CBRANCH_EXECZ:
- Changed = removeExeczBranch(MI, MBB);
- break;
- default:
- break;
- }
- }
-
- return Changed;
-}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 5271bc3aacc65..8b21b9346987f 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -559,7 +559,7 @@ bool isReadOnlySegment(const GlobalValue *GV) {
}
bool shouldEmitConstantsToTextSection(const Triple &TT) {
- return TT.getOS() == Triple::AMDPAL;
+ return TT.getOS() == Triple::AMDPAL || TT.getArch() == Triple::r600;
}
int getIntegerAttribute(const Function &F, StringRef Name, int Default) {