Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp')
 llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 79
 1 file changed, 73 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 191f603a66d6a..01bb60f07f2ec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -34,6 +34,7 @@
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Type.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -117,24 +118,58 @@ static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
   return true;
 }
 
+static void removeDoneExport(Function &F) {
+  ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
+        if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
+          Intrin->setArgOperand(6, BoolFalse); // done
+        } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
+          Intrin->setArgOperand(4, BoolFalse); // done
+        }
+      }
+    }
+  }
+}
+
 static BasicBlock *unifyReturnBlockSet(Function &F,
                                        ArrayRef<BasicBlock *> ReturningBlocks,
+                                       bool InsertExport,
                                        const TargetTransformInfo &TTI,
                                        StringRef Name) {
   // Otherwise, we need to insert a new basic block into the function, add PHI
   // nodes (if the function returns values), and convert all of the return
   // instructions into unconditional branches.
   BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
+  IRBuilder<> B(NewRetBlock);
+
+  if (InsertExport) {
+    // Ensure that there's only one "done" export in the shader by removing the
+    // "done" bit set on the original final export. More than one "done" export
+    // can lead to undefined behavior.
+    removeDoneExport(F);
+
+    Value *Undef = UndefValue::get(B.getFloatTy());
+    B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
+                      {
+                        B.getInt32(9), // target, SQ_EXP_NULL
+                        B.getInt32(0), // enabled channels
+                        Undef, Undef, Undef, Undef, // values
+                        B.getTrue(), // done
+                        B.getTrue(), // valid mask
+                      });
+  }
 
   PHINode *PN = nullptr;
   if (F.getReturnType()->isVoidTy()) {
-    ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
+    B.CreateRetVoid();
   } else {
     // If the function doesn't return void... add a PHI node to the block...
-    PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
-                         "UnifiedRetVal");
-    NewRetBlock->getInstList().push_back(PN);
-    ReturnInst::Create(F.getContext(), PN, NewRetBlock);
+    PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
+                     "UnifiedRetVal");
+    assert(!InsertExport);
+    B.CreateRet(PN);
   }
 
   // Loop over all of the blocks, replacing the return instruction with an
@@ -173,6 +208,8 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   // Dummy return block for infinite loop.
   BasicBlock *DummyReturnBB = nullptr;
 
+  bool InsertExport = false;
+
   for (BasicBlock *BB : PDT.getRoots()) {
     if (isa<ReturnInst>(BB->getTerminator())) {
       if (!isUniformlyReached(DA, *BB))
@@ -188,6 +225,36 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
                                          "DummyReturnBlock", &F);
         Type *RetTy = F.getReturnType();
         Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
+
+        // For pixel shaders, the producer guarantees that an export is
+        // executed before each return instruction. However, if there is an
+        // infinite loop and we insert a return ourselves, we need to uphold
+        // that guarantee by inserting a null export. This can happen e.g. in
+        // an infinite loop with kill instructions, which is supposed to
+        // terminate. However, we don't need to do this if there is a non-void
+        // return value, since then there is an epilog afterwards which will
+        // still export.
+        //
+        // Note: In the case where only some threads enter the infinite loop,
+        // this can result in the null export happening redundantly after the
+        // original exports. However, the last "real" export happens after all
+        // the threads that didn't enter an infinite loop converged, which
+        // means that the only extra threads to execute the null export are
+        // threads that entered the infinite loop, and they could only have
+        // exited by being killed, which sets their exec bit to 0.
+        // Therefore, unless there's an actual infinite loop, which can have
+        // invalid results, or there's a kill after the last export, which we
+        // assume the frontend won't do, this export will have the same exec
+        // mask as the last "real" export, and therefore the valid mask will be
+        // overwritten with the same value and will still be correct. Also,
+        // even though this forces an extra unnecessary export wait, we assume
+        // that this happens rarely enough in practice that we don't have to
+        // worry about performance.
+        if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
+            RetTy->isVoidTy()) {
+          InsertExport = true;
+        }
+
         ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
         ReturningBlocks.push_back(DummyReturnBB);
       }
@@ -260,6 +327,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   const TargetTransformInfo &TTI =
       getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 
-  unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock");
+  unifyReturnBlockSet(F, ReturningBlocks, InsertExport, TTI, "UnifiedReturnBlock");
   return true;
 }
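
For reference (an annotation added here, not part of the commit): the magic operand indices in removeDoneExport() pick out the "done" flag of the two export intrinsics. One representative overload of each, with operands labeled:

    ;                                 tgt  en   src0   src1   src2   src3  done  vm
    declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1,  i1)

    ;                                         tgt  en    src0        src1    done  vm
    declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1)

So "done" is operand 6 of llvm.amdgcn.exp and operand 4 of the compressed variant, matching the setArgOperand() calls above.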
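To make the end-to-end effect concrete, here is a hand-written sketch (a hypothetical shader, not a test from the commit, with the control-flow rewriting simplified) of a void AMDGPU_PS function in which some threads diverge into an infinite kill loop:

    define amdgpu_ps void @main(i1 %cond1, i1 %cond2) {
    entry:
      br i1 %cond1, label %loop, label %export   ; divergent branch

    loop:                                        ; threads leave only by being killed
      call void @llvm.amdgcn.kill(i1 %cond2)
      br label %loop

    export:                                      ; normal path exports with done = 1
      call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 0.0, float 0.0, float 0.0, float 1.0, i1 true, i1 true)
      ret void
    }

After the pass, roughly: the loop gets a dummy exit edge, both exits funnel into the unified return block, the original export loses its "done" bit, and the new null export (target 9 = SQ_EXP_NULL, no channels enabled, done and valid mask set) becomes the shader's single "done" export:

    define amdgpu_ps void @main(i1 %cond1, i1 %cond2) {
    entry:
      br i1 %cond1, label %loop, label %export

    loop:
      call void @llvm.amdgcn.kill(i1 %cond2)
      br i1 true, label %loop, label %DummyReturnBlock   ; dummy edge out of the loop

    export:
      ; "done" bit cleared by removeDoneExport()
      call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 0.0, float 0.0, float 0.0, float 1.0, i1 false, i1 true)
      br label %UnifiedReturnBlock

    DummyReturnBlock:
      br label %UnifiedReturnBlock

    UnifiedReturnBlock:
      call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
      ret void
    }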