Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp | 373 |
1 file changed, 373 insertions, 0 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
new file mode 100644
index 000000000000..097722157d41
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -0,0 +1,373 @@
+//===-- AMDGPULowerKernelAttributes.cpp -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass attempts to make use of reqd_work_group_size metadata
+/// to eliminate loads from the dispatch packet and to constant fold OpenCL
+/// get_local_size-like functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+
+#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"
+
+using namespace llvm;
+
+namespace {
+
+// Field offsets in hsa_kernel_dispatch_packet_t.
+enum DispatchPackedOffsets {
+  WORKGROUP_SIZE_X = 4,
+  WORKGROUP_SIZE_Y = 6,
+  WORKGROUP_SIZE_Z = 8,
+
+  GRID_SIZE_X = 12,
+  GRID_SIZE_Y = 16,
+  GRID_SIZE_Z = 20
+};
+
+// Field offsets relative to the implicit kernel argument pointer.
+enum ImplicitArgOffsets {
+  HIDDEN_BLOCK_COUNT_X = 0,
+  HIDDEN_BLOCK_COUNT_Y = 4,
+  HIDDEN_BLOCK_COUNT_Z = 8,
+
+  HIDDEN_GROUP_SIZE_X = 12,
+  HIDDEN_GROUP_SIZE_Y = 14,
+  HIDDEN_GROUP_SIZE_Z = 16,
+
+  HIDDEN_REMAINDER_X = 18,
+  HIDDEN_REMAINDER_Y = 20,
+  HIDDEN_REMAINDER_Z = 22,
+};
+
+class AMDGPULowerKernelAttributes : public ModulePass {
+public:
+  static char ID;
+
+  AMDGPULowerKernelAttributes() : ModulePass(ID) {}
+
+  bool runOnModule(Module &M) override;
+
+  StringRef getPassName() const override {
+    return "AMDGPU Kernel Attributes";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+  }
+};
+
+Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
+  auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
+                                 : Intrinsic::amdgcn_dispatch_ptr;
+  StringRef Name = Intrinsic::getName(IntrinsicId);
+  return M.getFunction(Name);
+}
+
+} // end anonymous namespace
+
+static bool processUse(CallInst *CI, bool IsV5OrAbove) {
+  Function *F = CI->getParent()->getParent();
+
+  auto MD = F->getMetadata("reqd_work_group_size");
+  const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
+
+  const bool HasUniformWorkGroupSize =
+      F->getFnAttribute("uniform-work-group-size").getValueAsBool();
+
+  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
+    return false;
+
+  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
+  Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
+  Value *Remainders[3] = {nullptr, nullptr, nullptr};
+  Value *GridSizes[3] = {nullptr, nullptr, nullptr};
+
+  const DataLayout &DL = F->getParent()->getDataLayout();
+
+  // We expect to see several GEP users, cast to the appropriate type and
+  // loaded.
+  for (User *U : CI->users()) {
+    if (!U->hasOneUse())
+      continue;
+
+    int64_t Offset = 0;
+    auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
+    auto *BCI = dyn_cast<BitCastInst>(U);
+    if (!Load && !BCI) {
+      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
+        continue;
+      Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
+      BCI = dyn_cast<BitCastInst>(*U->user_begin());
+    }
+
+    if (BCI) {
+      if (!BCI->hasOneUse())
+        continue;
+      Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
+    }
+
+    if (!Load || !Load->isSimple())
+      continue;
+
+    unsigned LoadSize = DL.getTypeStoreSize(Load->getType());
+
+    // TODO: Handle merged loads.
+    if (IsV5OrAbove) { // Base is ImplicitArgPtr.
+      switch (Offset) {
+      case HIDDEN_BLOCK_COUNT_X:
+        if (LoadSize == 4)
+          BlockCounts[0] = Load;
+        break;
+      case HIDDEN_BLOCK_COUNT_Y:
+        if (LoadSize == 4)
+          BlockCounts[1] = Load;
+        break;
+      case HIDDEN_BLOCK_COUNT_Z:
+        if (LoadSize == 4)
+          BlockCounts[2] = Load;
+        break;
+      case HIDDEN_GROUP_SIZE_X:
+        if (LoadSize == 2)
+          GroupSizes[0] = Load;
+        break;
+      case HIDDEN_GROUP_SIZE_Y:
+        if (LoadSize == 2)
+          GroupSizes[1] = Load;
+        break;
+      case HIDDEN_GROUP_SIZE_Z:
+        if (LoadSize == 2)
+          GroupSizes[2] = Load;
+        break;
+      case HIDDEN_REMAINDER_X:
+        if (LoadSize == 2)
+          Remainders[0] = Load;
+        break;
+      case HIDDEN_REMAINDER_Y:
+        if (LoadSize == 2)
+          Remainders[1] = Load;
+        break;
+      case HIDDEN_REMAINDER_Z:
+        if (LoadSize == 2)
+          Remainders[2] = Load;
+        break;
+      default:
+        break;
+      }
+    } else { // Base is DispatchPtr.
+      switch (Offset) {
+      case WORKGROUP_SIZE_X:
+        if (LoadSize == 2)
+          GroupSizes[0] = Load;
+        break;
+      case WORKGROUP_SIZE_Y:
+        if (LoadSize == 2)
+          GroupSizes[1] = Load;
+        break;
+      case WORKGROUP_SIZE_Z:
+        if (LoadSize == 2)
+          GroupSizes[2] = Load;
+        break;
+      case GRID_SIZE_X:
+        if (LoadSize == 4)
+          GridSizes[0] = Load;
+        break;
+      case GRID_SIZE_Y:
+        if (LoadSize == 4)
+          GridSizes[1] = Load;
+        break;
+      case GRID_SIZE_Z:
+        if (LoadSize == 4)
+          GridSizes[2] = Load;
+        break;
+      default:
+        break;
+      }
+    }
+  }
+
+  bool MadeChange = false;
+  if (IsV5OrAbove && HasUniformWorkGroupSize) {
+    // Under v5, __ockl_get_local_size returns the value computed by the
+    // expression:
+    //
+    //   workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder
+    //
+    // For functions with the attribute uniform-work-group-size=true, we can
+    // evaluate workgroup_id < hidden_block_count as true, and thus
+    // hidden_group_size is returned for __ockl_get_local_size.
+    for (int I = 0; I < 3; ++I) {
+      Value *BlockCount = BlockCounts[I];
+      if (!BlockCount)
+        continue;
+
+      using namespace llvm::PatternMatch;
+      auto GroupIDIntrin =
+          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
+                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
+                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
+
+      for (User *ICmp : BlockCount->users()) {
+        ICmpInst::Predicate Pred;
+        if (match(ICmp, m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount)))) {
+          if (Pred != ICmpInst::ICMP_ULT)
+            continue;
+          ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
+          MadeChange = true;
+        }
+      }
+    }
+
+    // All remainders should be 0 with uniform work group size.
+    for (Value *Remainder : Remainders) {
+      if (!Remainder)
+        continue;
+      Remainder->replaceAllUsesWith(
+          Constant::getNullValue(Remainder->getType()));
+      MadeChange = true;
+    }
+  } else if (HasUniformWorkGroupSize) { // Pre-V5.
+    // Pattern match the code used to handle partial workgroup dispatches in
+    // the library implementation of get_local_size, so the entire function
+    // can be constant folded with a known group size.
+    //
+    //   uint r = grid_size - group_id * group_size;
+    //   get_local_size = (r < group_size) ? r : group_size;
+    //
+    // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
+    // the grid_size is required to be a multiple of group_size. In this case:
+    //
+    //   grid_size - (group_id * group_size) < group_size
+    //   ->
+    //   grid_size < group_size + (group_id * group_size)
+    //
+    //   (grid_size / group_size) < 1 + group_id
+    //
+    // grid_size / group_size is at least 1, so we can conclude the select
+    // condition is false (except for group_id == 0, where the select result
+    // is the same).
+    for (int I = 0; I < 3; ++I) {
+      Value *GroupSize = GroupSizes[I];
+      Value *GridSize = GridSizes[I];
+      if (!GroupSize || !GridSize)
+        continue;
+
+      using namespace llvm::PatternMatch;
+      auto GroupIDIntrin =
+          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
+                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
+                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
+
+      for (User *U : GroupSize->users()) {
+        auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
+        if (!ZextGroupSize)
+          continue;
+
+        for (User *UMin : ZextGroupSize->users()) {
+          if (match(UMin,
+                    m_UMin(m_Sub(m_Specific(GridSize),
+                                 m_Mul(GroupIDIntrin,
+                                       m_Specific(ZextGroupSize))),
+                           m_Specific(ZextGroupSize)))) {
+            if (HasReqdWorkGroupSize) {
+              ConstantInt *KnownSize
+                = mdconst::extract<ConstantInt>(MD->getOperand(I));
+              UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
+                  KnownSize, UMin->getType(), false, DL));
+            } else {
+              UMin->replaceAllUsesWith(ZextGroupSize);
+            }
+
+            MadeChange = true;
+          }
+        }
+      }
+    }
+  }
+
+  // If reqd_work_group_size is set, we can replace the work group size with it.
+  if (!HasReqdWorkGroupSize)
+    return MadeChange;
+
+  for (int I = 0; I < 3; I++) {
+    Value *GroupSize = GroupSizes[I];
+    if (!GroupSize)
+      continue;
+
+    ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
+    GroupSize->replaceAllUsesWith(
+        ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));
+    MadeChange = true;
+  }
+
+  return MadeChange;
+}
+
+
+// TODO: Move makeLIDRangeMetadata usage into here. We don't seem to get
+// TargetPassConfig for the subtarget.
+bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
+  bool MadeChange = false;
+  bool IsV5OrAbove = AMDGPU::getCodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
+  Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);
+
+  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
+    return false;
+
+  SmallPtrSet<Instruction *, 4> HandledUses;
+  for (auto *U : BasePtr->users()) {
+    CallInst *CI = cast<CallInst>(U);
+    if (HandledUses.insert(CI).second) {
+      if (processUse(CI, IsV5OrAbove))
+        MadeChange = true;
+    }
+  }
+
+  return MadeChange;
+}
+
+
+INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
+                      "AMDGPU Kernel Attributes", false, false)
+INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
+                    "AMDGPU Kernel Attributes", false, false)
+
+char AMDGPULowerKernelAttributes::ID = 0;
+
+ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
+  return new AMDGPULowerKernelAttributes();
+}
+
+PreservedAnalyses
+AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
+  bool IsV5OrAbove =
+      AMDGPU::getCodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5;
+  Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);
+
+  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
+    return PreservedAnalyses::all();
+
+  for (Instruction &I : instructions(F)) {
+    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+      if (CI->getCalledFunction() == BasePtr)
+        processUse(CI, IsV5OrAbove);
+    }
+  }
+
+  return PreservedAnalyses::all();
+}
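
For context, a minimal sketch of the source-level situation this pass targets (OpenCL C; the kernel is illustrative, not part of this commit). The reqd_work_group_size attribute causes the frontend to attach !reqd_work_group_size metadata to the kernel, and the pass then folds get_local_size() to the constant instead of leaving a load from the dispatch packet or the implicit kernel arguments:

    /* Illustrative kernel: the attribute pins the workgroup size, so the
       loads behind get_local_size(0) can be constant folded to 64. */
    __attribute__((reqd_work_group_size(64, 1, 1)))
    __kernel void scale(__global float *out, float k) {
        size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);
        out[i] *= k;
    }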
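The COV5 branch of processUse keys on the select described in its comment. A hedged sketch of the shape of __ockl_get_local_size for one dimension (the names and signature are assumptions; the actual device-library source may differ):

    /* Sketch of the v5 local-size computation the pass pattern matches.
       The hidden_* parameters stand for the implicit kernel arguments at
       the ImplicitArgOffsets above; the names are illustrative. */
    uint local_size_x(uint hidden_block_count_x, ushort hidden_group_size_x,
                      ushort hidden_remainder_x) {
        /* With uniform-work-group-size=true, the compare folds to true and
           every remainder folds to 0, so the select collapses to
           hidden_group_size_x. */
        return get_group_id(0) < hidden_block_count_x ? hidden_group_size_x
                                                      : hidden_remainder_x;
    }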
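Likewise, the pre-V5 umin pattern corresponds to this shape, taken directly from the pass's own comment (same caveat: a sketch, not the actual library source):

    /* Pre-V5 partial-workgroup handling. With uniform-work-group-size the
       pass proves the select always picks group_size_x, and with
       reqd_work_group_size it folds the whole expression to the known
       constant. */
    uint local_size_x(uint grid_size_x, ushort group_size_x) {
        uint r = grid_size_x - get_group_id(0) * (uint)group_size_x;
        return r < group_size_x ? r : (uint)group_size_x; /* umin(r, size) */
    }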