Diffstat (limited to 'llvm/lib/Target/AMDGPU')
 llvm/lib/Target/AMDGPU/AMDGPU.h                          |   3
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp           |   2
 llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp |  79
 llvm/lib/Target/AMDGPU/CaymanInstructions.td             |   4
 llvm/lib/Target/AMDGPU/EvergreenInstructions.td          |   3
 llvm/lib/Target/AMDGPU/R600Instructions.td               |   7
 llvm/lib/Target/AMDGPU/SIInsertSkips.cpp                 |   5
 llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp            |  10
 llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp     | 158
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp          |   2
10 files changed, 90 insertions, 183 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index fbed51de0ea49..a55a1747cafe6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -156,9 +156,6 @@ extern char &SIWholeQuadModeID;
 void initializeSILowerControlFlowPass(PassRegistry &);
 extern char &SILowerControlFlowID;
 
-void initializeSIRemoveShortExecBranchesPass(PassRegistry &);
-extern char &SIRemoveShortExecBranchesID;
-
 void initializeSIInsertSkipsPass(PassRegistry &);
 extern char &SIInsertSkipsPassID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index eb30d659bf0b5..c8dc6f6e3bf4c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -228,7 +228,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSIModeRegisterPass(*PR);
   initializeSIWholeQuadModePass(*PR);
   initializeSILowerControlFlowPass(*PR);
-  initializeSIRemoveShortExecBranchesPass(*PR);
   initializeSIInsertSkipsPass(*PR);
   initializeSIMemoryLegalizerPass(*PR);
   initializeSIOptimizeExecMaskingPass(*PR);
@@ -994,7 +993,6 @@ void GCNPassConfig::addPreEmitPass() {
   // be better for it to emit S_NOP <N> when possible.
   addPass(&PostRAHazardRecognizerID);
 
-  addPass(&SIRemoveShortExecBranchesID);
   addPass(&SIInsertSkipsPassID);
   addPass(&BranchRelaxationPassID);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 191f603a66d6a..01bb60f07f2ec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -34,6 +34,7 @@
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Type.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -117,24 +118,58 @@ static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
   return true;
 }
 
+static void removeDoneExport(Function &F) {
+  ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
+        if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
+          Intrin->setArgOperand(6, BoolFalse); // done
+        } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
+          Intrin->setArgOperand(4, BoolFalse); // done
+        }
+      }
+    }
+  }
+}
+
 static BasicBlock *unifyReturnBlockSet(Function &F,
                                        ArrayRef<BasicBlock *> ReturningBlocks,
+                                       bool InsertExport,
                                        const TargetTransformInfo &TTI,
                                        StringRef Name) {
   // Otherwise, we need to insert a new basic block into the function, add a PHI
   // nodes (if the function returns values), and convert all of the return
   // instructions into unconditional branches.
   BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
+  IRBuilder<> B(NewRetBlock);
+
+  if (InsertExport) {
+    // Ensure that there's only one "done" export in the shader by removing the
+    // "done" bit set on the original final export. More than one "done" export
+    // can lead to undefined behavior.
+    removeDoneExport(F);
+
+    Value *Undef = UndefValue::get(B.getFloatTy());
+    B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
+                      {
+                        B.getInt32(9), // target, SQ_EXP_NULL
+                        B.getInt32(0), // enabled channels
+                        Undef, Undef, Undef, Undef, // values
+                        B.getTrue(), // done
+                        B.getTrue(), // valid mask
+                      });
+  }
 
   PHINode *PN = nullptr;
   if (F.getReturnType()->isVoidTy()) {
-    ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
+    B.CreateRetVoid();
   } else {
     // If the function doesn't return void... add a PHI node to the block...
-    PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
-                         "UnifiedRetVal");
-    NewRetBlock->getInstList().push_back(PN);
-    ReturnInst::Create(F.getContext(), PN, NewRetBlock);
+    PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
+                     "UnifiedRetVal");
+    assert(!InsertExport);
+    B.CreateRet(PN);
   }
 
   // Loop over all of the blocks, replacing the return instruction with an
@@ -173,6 +208,8 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   // Dummy return block for infinite loop.
   BasicBlock *DummyReturnBB = nullptr;
 
+  bool InsertExport = false;
+
   for (BasicBlock *BB : PDT.getRoots()) {
     if (isa<ReturnInst>(BB->getTerminator())) {
       if (!isUniformlyReached(DA, *BB))
@@ -188,6 +225,36 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
                                            "DummyReturnBlock", &F);
         Type *RetTy = F.getReturnType();
         Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
+
+        // For pixel shaders, the producer guarantees that an export is
+        // executed before each return instruction. However, if there is an
+        // infinite loop and we insert a return ourselves, we need to uphold
+        // that guarantee by inserting a null export. This can happen e.g. in
+        // an infinite loop with kill instructions, which is supposed to
+        // terminate. However, we don't need to do this if there is a non-void
+        // return value, since then there is an epilog afterwards which will
+        // still export.
+        //
+        // Note: In the case where only some threads enter the infinite loop,
+        // this can result in the null export happening redundantly after the
+        // original exports. However, the last "real" export happens after all
+        // the threads that didn't enter an infinite loop converged, which
+        // means that the only extra threads to execute the null export are
+        // threads that entered the infinite loop, and they only could've
+        // exited through being killed which sets their exec bit to 0.
+        // Therefore, unless there's an actual infinite loop, which can have
+        // invalid results, or there's a kill after the last export, which we
+        // assume the frontend won't do, this export will have the same exec
+        // mask as the last "real" export, and therefore the valid mask will be
+        // overwritten with the same value and will still be correct. Also,
+        // even though this forces an extra unnecessary export wait, we assume
+        // that this happens rarely enough in practice that we don't have to
+        // worry about performance.
+        if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
+            RetTy->isVoidTy()) {
+          InsertExport = true;
+        }
+
         ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
         ReturningBlocks.push_back(DummyReturnBB);
       }
@@ -260,6 +327,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   const TargetTransformInfo &TTI =
       getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 
-  unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock");
+  unifyReturnBlockSet(F, ReturningBlocks, InsertExport, TTI, "UnifiedReturnBlock");
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/CaymanInstructions.td b/llvm/lib/Target/AMDGPU/CaymanInstructions.td
index 1a526675164a0..e2978624811d1 100644
--- a/llvm/lib/Target/AMDGPU/CaymanInstructions.td
+++ b/llvm/lib/Target/AMDGPU/CaymanInstructions.td
@@ -50,6 +50,8 @@ def COS_cm : COS_Common<0x8E>;
 
 def : RsqPat<RECIPSQRT_IEEE_cm, f32>;
 
+def : SqrtPat<RECIPSQRT_IEEE_cm, RECIP_IEEE_cm>;
+
 def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
 
 defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
@@ -70,8 +72,6 @@ def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> {
 
-def : R600Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;
-
 class RAT_STORE_DWORD <RegisterClass rc, ValueType vt, bits<4> mask> :
   CF_MEM_RAT_CACHELESS <0x14, 0, mask,
                         (ins rc:$rw_gpr, R600_TReg32_X:$index_gpr),
diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
index 792e26d21f98d..88e554ae0bcc2 100644
--- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -118,11 +118,12 @@ def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
 def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
 def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
 def : RsqPat<RECIPSQRT_IEEE_eg, f32>;
+def : SqrtPat<RECIPSQRT_IEEE_eg, RECIP_IEEE_eg>;
+
 def SIN_eg : SIN_Common<0x8D>;
 def COS_eg : COS_Common<0x8E>;
 
 def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>;
-def : EGPat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>;
 } // End SubtargetPredicate = isEG
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/R600Instructions.td b/llvm/lib/Target/AMDGPU/R600Instructions.td
index cbdf0de44f873..869c183e2245b 100644
--- a/llvm/lib/Target/AMDGPU/R600Instructions.td
+++ b/llvm/lib/Target/AMDGPU/R600Instructions.td
@@ -1233,6 +1233,11 @@ def : R600Pat<
 def : RcpPat<recip_ieee, f32>;
 }
 
+class SqrtPat<Instruction RsqInst, Instruction RecipInst> : R600Pat <
+  (fsqrt f32:$src),
+  (RecipInst (RsqInst $src))
+>;
+
 //===----------------------------------------------------------------------===//
 // R600 / R700 Instructions
 //===----------------------------------------------------------------------===//
@@ -1272,8 +1277,8 @@ let Predicates = [isR600] in {
   defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
   def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>;
-  def : R600Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
   def : RsqPat<RECIPSQRT_IEEE_r600, f32>;
+  def : SqrtPat<RECIPSQRT_IEEE_r600, RECIP_IEEE_r600>;
 
 def R600_ExportSwz : ExportSwzInst {
   let Word1{20-17} = 0; // BURST_COUNT
diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
index 80c044ec00cb3..87e63fcc4a04f 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -41,7 +41,7 @@ using namespace llvm;
 #define DEBUG_TYPE "si-insert-skips"
 
 static cl::opt<unsigned> SkipThresholdFlag(
"amdgpu-skip-threshold-legacy", + "amdgpu-skip-threshold", cl::desc("Number of instructions before jumping over divergent control flow"), cl::init(12), cl::Hidden); @@ -466,9 +466,6 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { MachineInstr &MI = *I; switch (MI.getOpcode()) { - case AMDGPU::S_CBRANCH_EXECZ: - ExecBranchStack.push_back(MI.getOperand(0).getMBB()); - break; case AMDGPU::SI_MASK_BRANCH: ExecBranchStack.push_back(MI.getOperand(0).getMBB()); MadeChange |= skipMaskBranch(MI, MBB); diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 61d2719a3aad6..bf052dc3c9304 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -244,9 +244,9 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec) .addReg(Tmp, RegState::Kill); - // Insert the S_CBRANCH_EXECZ instruction which will be optimized later - // during SIRemoveShortExecBranches. - MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) + // Insert a pseudo terminator to help keep the verifier happy. This will also + // be used later when inserting skips. + MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH)) .add(MI.getOperand(2)); if (!LIS) { @@ -323,8 +323,8 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { .addReg(DstReg); MachineInstr *Branch = - BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) - .addMBB(DestBB); + BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::SI_MASK_BRANCH)) + .addMBB(DestBB); if (!LIS) { MI.eraseFromParent(); diff --git a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp deleted file mode 100644 index 51779e97ac620..0000000000000 --- a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp +++ /dev/null @@ -1,158 +0,0 @@ -//===-- SIRemoveShortExecBranches.cpp ------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass optmizes the s_cbranch_execz instructions. -/// The pass removes this skip instruction for short branches, -/// if there is no unwanted sideeffect in the fallthrough code sequence. 
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIInstrInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/Support/CommandLine.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "si-remove-short-exec-branches"
-
-static unsigned SkipThreshold;
-
-static cl::opt<unsigned, true> SkipThresholdFlag(
-    "amdgpu-skip-threshold", cl::Hidden,
-    cl::desc(
-        "Number of instructions before jumping over divergent control flow"),
-    cl::location(SkipThreshold), cl::init(12));
-
-namespace {
-
-class SIRemoveShortExecBranches : public MachineFunctionPass {
-private:
-  const SIInstrInfo *TII = nullptr;
-  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
-                            MachineBasicBlock *&TrueMBB,
-                            MachineBasicBlock *&FalseMBB,
-                            SmallVectorImpl<MachineOperand> &Cond);
-  bool mustRetainExeczBranch(const MachineBasicBlock &From,
-                             const MachineBasicBlock &To) const;
-  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
-
-public:
-  static char ID;
-
-  SIRemoveShortExecBranches() : MachineFunctionPass(ID) {
-    initializeSIRemoveShortExecBranchesPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS(SIRemoveShortExecBranches, DEBUG_TYPE,
-                "SI remove short exec branches", false, false)
-
-char SIRemoveShortExecBranches::ID = 0;
-
-char &llvm::SIRemoveShortExecBranchesID = SIRemoveShortExecBranches::ID;
-
-bool SIRemoveShortExecBranches::getBlockDestinations(
-    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
-    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
-  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
-    return false;
-
-  if (!FalseMBB)
-    FalseMBB = SrcMBB.getNextNode();
-
-  return true;
-}
-
-bool SIRemoveShortExecBranches::mustRetainExeczBranch(
-    const MachineBasicBlock &From, const MachineBasicBlock &To) const {
-  unsigned NumInstr = 0;
-  const MachineFunction *MF = From.getParent();
-
-  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
-       MBBI != End && MBBI != ToI; ++MBBI) {
-    const MachineBasicBlock &MBB = *MBBI;
-
-    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
-         I != E; ++I) {
-      // When a uniform loop is inside non-uniform control flow, the branch
-      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
-      // when EXEC = 0. We should skip the loop lest it becomes infinite.
-      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
-          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
-        return true;
-
-      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
-        return true;
-
-      // These instructions are potentially expensive even if EXEC = 0.
-      if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
-          I->getOpcode() == AMDGPU::S_WAITCNT)
-        return true;
-
-      ++NumInstr;
-      if (NumInstr >= SkipThreshold)
-        return true;
-    }
-  }
-
-  return false;
-}
-
-// Returns true if the skip branch instruction is removed.
-bool SIRemoveShortExecBranches::removeExeczBranch(MachineInstr &MI,
-                                                  MachineBasicBlock &SrcMBB) {
-  MachineBasicBlock *TrueMBB = nullptr;
-  MachineBasicBlock *FalseMBB = nullptr;
-  SmallVector<MachineOperand, 1> Cond;
-
-  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
-    return false;
-
-  // Consider only the forward branches.
-  if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
-      mustRetainExeczBranch(*FalseMBB, *TrueMBB))
-    return false;
-
-  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
-  MI.eraseFromParent();
-  SrcMBB.removeSuccessor(TrueMBB);
-
-  return true;
-}
-
-bool SIRemoveShortExecBranches::runOnMachineFunction(MachineFunction &MF) {
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  TII = ST.getInstrInfo();
-  MF.RenumberBlocks();
-  bool Changed = false;
-
-  for (MachineBasicBlock &MBB : MF) {
-    MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
-    if (MBBI == MBB.end())
-      continue;
-
-    MachineInstr &MI = *MBBI;
-    switch (MI.getOpcode()) {
-    case AMDGPU::S_CBRANCH_EXECZ:
-      Changed = removeExeczBranch(MI, MBB);
-      break;
-    default:
-      break;
-    }
-  }
-
-  return Changed;
-}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 5271bc3aacc65..8b21b9346987f 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -559,7 +559,7 @@ bool isReadOnlySegment(const GlobalValue *GV) {
 }
 
 bool shouldEmitConstantsToTextSection(const Triple &TT) {
-  return TT.getOS() == Triple::AMDPAL;
+  return TT.getOS() == Triple::AMDPAL || TT.getArch() == Triple::r600;
 }
 
 int getIntegerAttribute(const Function &F, StringRef Name, int Default) {
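
Note on the AMDGPUUnifyDivergentExitNodes change: the null export targets SQ_EXP_NULL (target 9) with no channels enabled, and removeDoneExport clears operand 6 of llvm.amdgcn.exp but operand 4 of llvm.amdgcn.exp.compr because the compressed form carries two packed value operands instead of four, which shifts the position of the "done" flag. The sketch below distills when the pass decides a null export is needed; needsNullExport and FabricatedDummyReturn are hypothetical names for illustration, not code from the patch:

    #include "llvm/IR/CallingConv.h"
    #include "llvm/IR/Function.h"

    // Hypothetical distillation of the InsertExport decision: a return was
    // fabricated for an infinite loop, the function is a pixel shader, and it
    // returns void (a non-void pixel shader still reaches an exporting epilog,
    // so no extra export is required there).
    static bool needsNullExport(const llvm::Function &F,
                                bool FabricatedDummyReturn) {
      return FabricatedDummyReturn &&
             F.getCallingConv() == llvm::CallingConv::AMDGPU_PS &&
             F.getReturnType()->isVoidTy();
    }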
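Note on the R600/Evergreen/Cayman TableGen changes: the old patterns expanded (fsqrt x) to (MUL x, (RECIPSQRT_CLAMPED x)), while the shared SqrtPat class expands it to (RECIP_IEEE (RECIPSQRT_IEEE x)). The following host-side C++ sketch contrasts the arithmetic of the two forms; the scalar models of the hardware ops, including the assumption that RECIPSQRT_CLAMPED clamps an infinite result to +/-FLT_MAX, are illustrative stand-ins rather than the hardware definitions:

    #include <cfloat>
    #include <cmath>
    #include <cstdio>

    // Assumed scalar models of the R600 ops referenced by the patterns.
    static float recipsqrt_ieee(float x) { return 1.0f / std::sqrt(x); }
    static float recip_ieee(float x) { return 1.0f / x; }
    static float recipsqrt_clamped(float x) {
      float r = recipsqrt_ieee(x);
      // Clamp an infinite result to the largest finite float.
      return std::isinf(r) ? std::copysign(FLT_MAX, r) : r;
    }

    // Old pattern: (fsqrt x) -> (MUL x, (RECIPSQRT_CLAMPED x))
    static float sqrt_old(float x) { return x * recipsqrt_clamped(x); }
    // New pattern: (fsqrt x) -> (RECIP_IEEE (RECIPSQRT_IEEE x))
    static float sqrt_new(float x) { return recip_ieee(recipsqrt_ieee(x)); }

    int main() {
      const float Xs[] = {0.0f, 4.0f, INFINITY};
      for (float X : Xs)
        std::printf("x=%g  old=%g  new=%g\n", X, sqrt_old(X), sqrt_new(X));
      // x=0 and x=4 agree under both forms (0 and 2). For x=inf the
      // multiply-based form evaluates inf * 0 and yields NaN, while the
      // reciprocal-of-rsq form yields inf.
      return 0;
    }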
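Note on the SIInsertSkips change: with SIRemoveShortExecBranches deleted, its cl::opt name becomes free again, so SIInsertSkips drops the -legacy suffix (which had kept it from colliding with the deleted pass's identically named option) and keeps the default threshold of 12 instructions. A hypothetical llc invocation overriding the threshold; the triple, CPU, and file names are placeholders:

    llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-skip-threshold=4 shader.ll -o shader.s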