diff options
author | Dimitry Andric <dim@FreeBSD.org> | 2023-04-14 21:41:27 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2023-06-22 18:20:56 +0000 |
commit | bdd1243df58e60e85101c09001d9812a789b6bc4 (patch) | |
tree | a1ce621c7301dd47ba2ddc3b8eaa63b441389481 /contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp | |
parent | 781624ca2d054430052c828ba8d2c2eaf2d733e7 (diff) | |
parent | e3b557809604d036af6e00c60f012c2025b59a5e (diff) |
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp | 324 |
1 files changed, 213 insertions, 111 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp index f5903b3afb81..56e5e0708492 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -41,6 +42,21 @@ enum DispatchPackedOffsets { GRID_SIZE_Z = 20 }; +// Field offsets to implicit kernel argument pointer. +enum ImplicitArgOffsets { + HIDDEN_BLOCK_COUNT_X = 0, + HIDDEN_BLOCK_COUNT_Y = 4, + HIDDEN_BLOCK_COUNT_Z = 8, + + HIDDEN_GROUP_SIZE_X = 12, + HIDDEN_GROUP_SIZE_Y = 14, + HIDDEN_GROUP_SIZE_Z = 16, + + HIDDEN_REMAINDER_X = 18, + HIDDEN_REMAINDER_Y = 20, + HIDDEN_REMAINDER_Z = 22, +}; + class AMDGPULowerKernelAttributes : public ModulePass { public: static char ID; @@ -58,9 +74,16 @@ public: } }; +Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) { + auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr + : Intrinsic::amdgcn_dispatch_ptr; + StringRef Name = Intrinsic::getName(IntrinsicId); + return M.getFunction(Name); +} + } // end anonymous namespace -static bool processUse(CallInst *CI) { +static bool processUse(CallInst *CI, bool IsV5OrAbove) { Function *F = CI->getParent()->getParent(); auto MD = F->getMetadata("reqd_work_group_size"); @@ -72,13 +95,10 @@ static bool processUse(CallInst *CI) { if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize) return false; - Value *WorkGroupSizeX = nullptr; - Value *WorkGroupSizeY = nullptr; - Value *WorkGroupSizeZ = nullptr; - - Value *GridSizeX = nullptr; - Value *GridSizeY = nullptr; - Value *GridSizeZ = nullptr; + Value *BlockCounts[3] = {nullptr, nullptr, nullptr}; + Value *GroupSizes[3] = {nullptr, nullptr, nullptr}; + Value *Remainders[3] = {nullptr, nullptr, nullptr}; + Value *GridSizes[3] = {nullptr, nullptr, nullptr}; const DataLayout &DL = F->getParent()->getDataLayout(); @@ -89,148 +109,230 @@ static bool processUse(CallInst *CI) { continue; int64_t Offset = 0; - if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI) - continue; + auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr? + auto *BCI = dyn_cast<BitCastInst>(U); + if (!Load && !BCI) { + if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI) + continue; + Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP? + BCI = dyn_cast<BitCastInst>(*U->user_begin()); + } - auto *BCI = dyn_cast<BitCastInst>(*U->user_begin()); - if (!BCI || !BCI->hasOneUse()) - continue; + if (BCI) { + if (!BCI->hasOneUse()) + continue; + Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI? + } - auto *Load = dyn_cast<LoadInst>(*BCI->user_begin()); if (!Load || !Load->isSimple()) continue; unsigned LoadSize = DL.getTypeStoreSize(Load->getType()); // TODO: Handle merged loads. - switch (Offset) { - case WORKGROUP_SIZE_X: - if (LoadSize == 2) - WorkGroupSizeX = Load; - break; - case WORKGROUP_SIZE_Y: - if (LoadSize == 2) - WorkGroupSizeY = Load; - break; - case WORKGROUP_SIZE_Z: - if (LoadSize == 2) - WorkGroupSizeZ = Load; - break; - case GRID_SIZE_X: - if (LoadSize == 4) - GridSizeX = Load; - break; - case GRID_SIZE_Y: - if (LoadSize == 4) - GridSizeY = Load; - break; - case GRID_SIZE_Z: - if (LoadSize == 4) - GridSizeZ = Load; - break; - default: - break; + if (IsV5OrAbove) { // Base is ImplicitArgPtr. + switch (Offset) { + case HIDDEN_BLOCK_COUNT_X: + if (LoadSize == 4) + BlockCounts[0] = Load; + break; + case HIDDEN_BLOCK_COUNT_Y: + if (LoadSize == 4) + BlockCounts[1] = Load; + break; + case HIDDEN_BLOCK_COUNT_Z: + if (LoadSize == 4) + BlockCounts[2] = Load; + break; + case HIDDEN_GROUP_SIZE_X: + if (LoadSize == 2) + GroupSizes[0] = Load; + break; + case HIDDEN_GROUP_SIZE_Y: + if (LoadSize == 2) + GroupSizes[1] = Load; + break; + case HIDDEN_GROUP_SIZE_Z: + if (LoadSize == 2) + GroupSizes[2] = Load; + break; + case HIDDEN_REMAINDER_X: + if (LoadSize == 2) + Remainders[0] = Load; + break; + case HIDDEN_REMAINDER_Y: + if (LoadSize == 2) + Remainders[1] = Load; + break; + case HIDDEN_REMAINDER_Z: + if (LoadSize == 2) + Remainders[2] = Load; + break; + default: + break; + } + } else { // Base is DispatchPtr. + switch (Offset) { + case WORKGROUP_SIZE_X: + if (LoadSize == 2) + GroupSizes[0] = Load; + break; + case WORKGROUP_SIZE_Y: + if (LoadSize == 2) + GroupSizes[1] = Load; + break; + case WORKGROUP_SIZE_Z: + if (LoadSize == 2) + GroupSizes[2] = Load; + break; + case GRID_SIZE_X: + if (LoadSize == 4) + GridSizes[0] = Load; + break; + case GRID_SIZE_Y: + if (LoadSize == 4) + GridSizes[1] = Load; + break; + case GRID_SIZE_Z: + if (LoadSize == 4) + GridSizes[2] = Load; + break; + default: + break; + } } } - // Pattern match the code used to handle partial workgroup dispatches in the - // library implementation of get_local_size, so the entire function can be - // constant folded with a known group size. - // - // uint r = grid_size - group_id * group_size; - // get_local_size = (r < group_size) ? r : group_size; - // - // If we have uniform-work-group-size (which is the default in OpenCL 1.2), - // the grid_size is required to be a multiple of group_size). In this case: - // - // grid_size - (group_id * group_size) < group_size - // -> - // grid_size < group_size + (group_id * group_size) - // - // (grid_size / group_size) < 1 + group_id - // - // grid_size / group_size is at least 1, so we can conclude the select - // condition is false (except for group_id == 0, where the select result is - // the same). - bool MadeChange = false; - Value *WorkGroupSizes[3] = { WorkGroupSizeX, WorkGroupSizeY, WorkGroupSizeZ }; - Value *GridSizes[3] = { GridSizeX, GridSizeY, GridSizeZ }; - - for (int I = 0; HasUniformWorkGroupSize && I < 3; ++I) { - Value *GroupSize = WorkGroupSizes[I]; - Value *GridSize = GridSizes[I]; - if (!GroupSize || !GridSize) - continue; + if (IsV5OrAbove && HasUniformWorkGroupSize) { + // Under v5 __ockl_get_local_size returns the value computed by the expression: + // + // workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder + // + // For functions with the attribute uniform-work-group-size=true. we can evaluate + // workgroup_id < hidden_block_count as true, and thus hidden_group_size is returned + // for __ockl_get_local_size. + for (int I = 0; I < 3; ++I) { + Value *BlockCount = BlockCounts[I]; + if (!BlockCount) + continue; - using namespace llvm::PatternMatch; - auto GroupIDIntrin = - I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>() - : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>() - : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>()); + using namespace llvm::PatternMatch; + auto GroupIDIntrin = + I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>() + : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>() + : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>()); + + for (User *ICmp : BlockCount->users()) { + ICmpInst::Predicate Pred; + if (match(ICmp, m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount)))) { + if (Pred != ICmpInst::ICMP_ULT) + continue; + ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType())); + MadeChange = true; + } + } + } - for (User *U : GroupSize->users()) { - auto *ZextGroupSize = dyn_cast<ZExtInst>(U); - if (!ZextGroupSize) + // All remainders should be 0 with uniform work group size. + for (Value *Remainder : Remainders) { + if (!Remainder) + continue; + Remainder->replaceAllUsesWith(Constant::getNullValue(Remainder->getType())); + MadeChange = true; + } + } else if (HasUniformWorkGroupSize) { // Pre-V5. + // Pattern match the code used to handle partial workgroup dispatches in the + // library implementation of get_local_size, so the entire function can be + // constant folded with a known group size. + // + // uint r = grid_size - group_id * group_size; + // get_local_size = (r < group_size) ? r : group_size; + // + // If we have uniform-work-group-size (which is the default in OpenCL 1.2), + // the grid_size is required to be a multiple of group_size). In this case: + // + // grid_size - (group_id * group_size) < group_size + // -> + // grid_size < group_size + (group_id * group_size) + // + // (grid_size / group_size) < 1 + group_id + // + // grid_size / group_size is at least 1, so we can conclude the select + // condition is false (except for group_id == 0, where the select result is + // the same). + for (int I = 0; I < 3; ++I) { + Value *GroupSize = GroupSizes[I]; + Value *GridSize = GridSizes[I]; + if (!GroupSize || !GridSize) continue; - for (User *UMin : ZextGroupSize->users()) { - if (match(UMin, - m_UMin(m_Sub(m_Specific(GridSize), - m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))), - m_Specific(ZextGroupSize)))) { - if (HasReqdWorkGroupSize) { - ConstantInt *KnownSize - = mdconst::extract<ConstantInt>(MD->getOperand(I)); - UMin->replaceAllUsesWith(ConstantExpr::getIntegerCast( - KnownSize, UMin->getType(), false)); - } else { - UMin->replaceAllUsesWith(ZextGroupSize); + using namespace llvm::PatternMatch; + auto GroupIDIntrin = + I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>() + : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>() + : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>()); + + for (User *U : GroupSize->users()) { + auto *ZextGroupSize = dyn_cast<ZExtInst>(U); + if (!ZextGroupSize) + continue; + + for (User *UMin : ZextGroupSize->users()) { + if (match(UMin, + m_UMin(m_Sub(m_Specific(GridSize), + m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))), + m_Specific(ZextGroupSize)))) { + if (HasReqdWorkGroupSize) { + ConstantInt *KnownSize + = mdconst::extract<ConstantInt>(MD->getOperand(I)); + UMin->replaceAllUsesWith(ConstantExpr::getIntegerCast( + KnownSize, UMin->getType(), false)); + } else { + UMin->replaceAllUsesWith(ZextGroupSize); + } + + MadeChange = true; } - - MadeChange = true; } } } } + // If reqd_work_group_size is set, we can replace work group size with it. if (!HasReqdWorkGroupSize) return MadeChange; - // Eliminate any other loads we can from the dispatch packet. - for (int I = 0; I < 3; ++I) { - Value *GroupSize = WorkGroupSizes[I]; + for (int I = 0; I < 3; I++) { + Value *GroupSize = GroupSizes[I]; if (!GroupSize) continue; ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I)); GroupSize->replaceAllUsesWith( - ConstantExpr::getIntegerCast(KnownSize, - GroupSize->getType(), - false)); + ConstantExpr::getIntegerCast(KnownSize, GroupSize->getType(), false)); MadeChange = true; } return MadeChange; } + // TODO: Move makeLIDRangeMetadata usage into here. Seem to not get // TargetPassConfig for subtarget. bool AMDGPULowerKernelAttributes::runOnModule(Module &M) { - StringRef DispatchPtrName - = Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr); + bool MadeChange = false; + bool IsV5OrAbove = AMDGPU::getAmdhsaCodeObjectVersion() >= 5; + Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove); - Function *DispatchPtr = M.getFunction(DispatchPtrName); - if (!DispatchPtr) // Dispatch ptr not used. + if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used. return false; - bool MadeChange = false; - SmallPtrSet<Instruction *, 4> HandledUses; - for (auto *U : DispatchPtr->users()) { + for (auto *U : BasePtr->users()) { CallInst *CI = cast<CallInst>(U); if (HandledUses.insert(CI).second) { - if (processUse(CI)) + if (processUse(CI, IsV5OrAbove)) MadeChange = true; } } @@ -238,6 +340,7 @@ bool AMDGPULowerKernelAttributes::runOnModule(Module &M) { return MadeChange; } + INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE, "AMDGPU Kernel Attributes", false, false) INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE, @@ -251,17 +354,16 @@ ModulePass *llvm::createAMDGPULowerKernelAttributesPass() { PreservedAnalyses AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) { - StringRef DispatchPtrName = - Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr); + bool IsV5OrAbove = AMDGPU::getAmdhsaCodeObjectVersion() >= 5; + Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove); - Function *DispatchPtr = F.getParent()->getFunction(DispatchPtrName); - if (!DispatchPtr) // Dispatch ptr not used. + if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used. return PreservedAnalyses::all(); for (Instruction &I : instructions(F)) { if (CallInst *CI = dyn_cast<CallInst>(&I)) { - if (CI->getCalledFunction() == DispatchPtr) - processUse(CI); + if (CI->getCalledFunction() == BasePtr) + processUse(CI, IsV5OrAbove); } } |