src - FreeBSD source tree

diff options


context:
space:
mode:

author	Dimitry Andric <dim@FreeBSD.org>	2023-04-14 21:41:27 +0000
committer	Dimitry Andric <dim@FreeBSD.org>	2023-06-22 18:20:56 +0000
commit	bdd1243df58e60e85101c09001d9812a789b6bc4 (patch)
tree	a1ce621c7301dd47ba2ddc3b8eaa63b441389481 /contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
parent	781624ca2d054430052c828ba8d2c2eaf2d733e7 (diff)
parent	e3b557809604d036af6e00c60f012c2025b59a5e (diff)

Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp')

-rw-r--r--

contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp

324

1 files changed, 213 insertions, 111 deletions

diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index f5903b3afb81..56e5e0708492 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp

@@ -13,6 +13,7 @@

//===----------------------------------------------------------------------===//

#include "AMDGPU.h"

+#include "Utils/AMDGPUBaseInfo.h"

#include "llvm/Analysis/ValueTracking.h"

#include "llvm/CodeGen/Passes.h"

#include "llvm/CodeGen/TargetPassConfig.h"

@@ -41,6 +42,21 @@ enum DispatchPackedOffsets {

GRID_SIZE_Z = 20

};

+// Field offsets to implicit kernel argument pointer.

+enum ImplicitArgOffsets {

+ HIDDEN_BLOCK_COUNT_X = 0,

+ HIDDEN_BLOCK_COUNT_Y = 4,

+ HIDDEN_BLOCK_COUNT_Z = 8,

+ HIDDEN_GROUP_SIZE_X = 12,

+ HIDDEN_GROUP_SIZE_Y = 14,

+ HIDDEN_GROUP_SIZE_Z = 16,

+ HIDDEN_REMAINDER_X = 18,

+ HIDDEN_REMAINDER_Y = 20,

+ HIDDEN_REMAINDER_Z = 22,

+};

class AMDGPULowerKernelAttributes : public ModulePass {

public:

static char ID;

@@ -58,9 +74,16 @@ public:

}

};

+Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {

+ auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr

+ : Intrinsic::amdgcn_dispatch_ptr;

+ StringRef Name = Intrinsic::getName(IntrinsicId);

+ return M.getFunction(Name);

} // end anonymous namespace

-static bool processUse(CallInst *CI) {

+static bool processUse(CallInst *CI, bool IsV5OrAbove) {

Function *F = CI->getParent()->getParent();

auto MD = F->getMetadata("reqd_work_group_size");

@@ -72,13 +95,10 @@ static bool processUse(CallInst *CI) {

if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)

return false;

- Value *WorkGroupSizeX = nullptr;

- Value *WorkGroupSizeY = nullptr;

- Value *WorkGroupSizeZ = nullptr;

- Value *GridSizeX = nullptr;

- Value *GridSizeY = nullptr;

- Value *GridSizeZ = nullptr;

+ Value *BlockCounts[3] = {nullptr, nullptr, nullptr};

+ Value *GroupSizes[3] = {nullptr, nullptr, nullptr};

+ Value *Remainders[3] = {nullptr, nullptr, nullptr};

+ Value *GridSizes[3] = {nullptr, nullptr, nullptr};

const DataLayout &DL = F->getParent()->getDataLayout();

@@ -89,148 +109,230 @@ static bool processUse(CallInst *CI) {

continue;

int64_t Offset = 0;

- if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)

- continue;

+ auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?

+ auto *BCI = dyn_cast<BitCastInst>(U);

+ if (!Load && !BCI) {

+ if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)

+ continue;

+ Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?

+ BCI = dyn_cast<BitCastInst>(*U->user_begin());

+ }

- auto *BCI = dyn_cast<BitCastInst>(*U->user_begin());

- if (!BCI || !BCI->hasOneUse())

- continue;

+ if (BCI) {

+ if (!BCI->hasOneUse())

+ continue;

+ Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?

+ }

- auto *Load = dyn_cast<LoadInst>(*BCI->user_begin());

if (!Load || !Load->isSimple())

continue;

unsigned LoadSize = DL.getTypeStoreSize(Load->getType());

// TODO: Handle merged loads.

- switch (Offset) {

- case WORKGROUP_SIZE_X:

- if (LoadSize == 2)

- WorkGroupSizeX = Load;

- break;

- case WORKGROUP_SIZE_Y:

- if (LoadSize == 2)

- WorkGroupSizeY = Load;

- break;

- case WORKGROUP_SIZE_Z:

- if (LoadSize == 2)

- WorkGroupSizeZ = Load;

- break;

- case GRID_SIZE_X:

- if (LoadSize == 4)

- GridSizeX = Load;

- break;

- case GRID_SIZE_Y:

- if (LoadSize == 4)

- GridSizeY = Load;

- break;

- case GRID_SIZE_Z:

- if (LoadSize == 4)

- GridSizeZ = Load;

- break;

- default:

- break;

+ if (IsV5OrAbove) { // Base is ImplicitArgPtr.

+ switch (Offset) {

+ case HIDDEN_BLOCK_COUNT_X:

+ if (LoadSize == 4)

+ BlockCounts[0] = Load;

+ break;

+ case HIDDEN_BLOCK_COUNT_Y:

+ if (LoadSize == 4)

+ BlockCounts[1] = Load;

+ break;

+ case HIDDEN_BLOCK_COUNT_Z:

+ if (LoadSize == 4)

+ BlockCounts[2] = Load;

+ break;

+ case HIDDEN_GROUP_SIZE_X:

+ if (LoadSize == 2)

+ GroupSizes[0] = Load;

+ break;

+ case HIDDEN_GROUP_SIZE_Y:

+ if (LoadSize == 2)

+ GroupSizes[1] = Load;

+ break;

+ case HIDDEN_GROUP_SIZE_Z:

+ if (LoadSize == 2)

+ GroupSizes[2] = Load;

+ break;

+ case HIDDEN_REMAINDER_X:

+ if (LoadSize == 2)

+ Remainders[0] = Load;

+ break;

+ case HIDDEN_REMAINDER_Y:

+ if (LoadSize == 2)

+ Remainders[1] = Load;

+ break;

+ case HIDDEN_REMAINDER_Z:

+ if (LoadSize == 2)

+ Remainders[2] = Load;

+ break;

+ default:

+ break;

+ }

+ } else { // Base is DispatchPtr.

+ switch (Offset) {

+ case WORKGROUP_SIZE_X:

+ if (LoadSize == 2)

+ GroupSizes[0] = Load;

+ break;

+ case WORKGROUP_SIZE_Y:

+ if (LoadSize == 2)

+ GroupSizes[1] = Load;

+ break;

+ case WORKGROUP_SIZE_Z:

+ if (LoadSize == 2)

+ GroupSizes[2] = Load;

+ break;

+ case GRID_SIZE_X:

+ if (LoadSize == 4)

+ GridSizes[0] = Load;

+ break;

+ case GRID_SIZE_Y:

+ if (LoadSize == 4)

+ GridSizes[1] = Load;

+ break;

+ case GRID_SIZE_Z:

+ if (LoadSize == 4)

+ GridSizes[2] = Load;

+ break;

+ default:

+ break;

+ }

}

- // Pattern match the code used to handle partial workgroup dispatches in the

- // library implementation of get_local_size, so the entire function can be

- // constant folded with a known group size.

- //

- // uint r = grid_size - group_id * group_size;

- // get_local_size = (r < group_size) ? r : group_size;

- //

- // If we have uniform-work-group-size (which is the default in OpenCL 1.2),

- // the grid_size is required to be a multiple of group_size). In this case:

- //

- // grid_size - (group_id * group_size) < group_size

- // ->

- // grid_size < group_size + (group_id * group_size)

- //

- // (grid_size / group_size) < 1 + group_id

- //

- // grid_size / group_size is at least 1, so we can conclude the select

- // condition is false (except for group_id == 0, where the select result is

- // the same).

bool MadeChange = false;

- Value *WorkGroupSizes[3] = { WorkGroupSizeX, WorkGroupSizeY, WorkGroupSizeZ };

- Value *GridSizes[3] = { GridSizeX, GridSizeY, GridSizeZ };

- for (int I = 0; HasUniformWorkGroupSize && I < 3; ++I) {

- Value *GroupSize = WorkGroupSizes[I];

- Value *GridSize = GridSizes[I];

- if (!GroupSize || !GridSize)

- continue;

+ if (IsV5OrAbove && HasUniformWorkGroupSize) {

+ // Under v5 __ockl_get_local_size returns the value computed by the expression:

+ //

+ // workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder

+ //

+ // For functions with the attribute uniform-work-group-size=true. we can evaluate

+ // workgroup_id < hidden_block_count as true, and thus hidden_group_size is returned

+ // for __ockl_get_local_size.

+ for (int I = 0; I < 3; ++I) {

+ Value *BlockCount = BlockCounts[I];

+ if (!BlockCount)

+ continue;

- using namespace llvm::PatternMatch;

- auto GroupIDIntrin =

- I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()

- : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()

- : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

+ using namespace llvm::PatternMatch;

+ auto GroupIDIntrin =

+ I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()

+ : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()

+ : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

+ for (User *ICmp : BlockCount->users()) {

+ ICmpInst::Predicate Pred;

+ if (match(ICmp, m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount)))) {

+ if (Pred != ICmpInst::ICMP_ULT)

+ continue;

+ ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));

+ MadeChange = true;

+ }

- for (User *U : GroupSize->users()) {

- auto *ZextGroupSize = dyn_cast<ZExtInst>(U);

- if (!ZextGroupSize)

+ // All remainders should be 0 with uniform work group size.

+ for (Value *Remainder : Remainders) {

+ if (!Remainder)

+ continue;

+ Remainder->replaceAllUsesWith(Constant::getNullValue(Remainder->getType()));

+ MadeChange = true;

+ }

+ } else if (HasUniformWorkGroupSize) { // Pre-V5.

+ // Pattern match the code used to handle partial workgroup dispatches in the

+ // library implementation of get_local_size, so the entire function can be

+ // constant folded with a known group size.

+ //

+ // uint r = grid_size - group_id * group_size;

+ // get_local_size = (r < group_size) ? r : group_size;

+ //

+ // If we have uniform-work-group-size (which is the default in OpenCL 1.2),

+ // the grid_size is required to be a multiple of group_size). In this case:

+ //

+ // grid_size - (group_id * group_size) < group_size

+ // ->

+ // grid_size < group_size + (group_id * group_size)

+ //

+ // (grid_size / group_size) < 1 + group_id

+ //

+ // grid_size / group_size is at least 1, so we can conclude the select

+ // condition is false (except for group_id == 0, where the select result is

+ // the same).

+ for (int I = 0; I < 3; ++I) {

+ Value *GroupSize = GroupSizes[I];

+ Value *GridSize = GridSizes[I];

+ if (!GroupSize || !GridSize)

continue;

- for (User *UMin : ZextGroupSize->users()) {

- if (match(UMin,

- m_UMin(m_Sub(m_Specific(GridSize),

- m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),

- m_Specific(ZextGroupSize)))) {

- if (HasReqdWorkGroupSize) {

- ConstantInt *KnownSize

- = mdconst::extract<ConstantInt>(MD->getOperand(I));

- UMin->replaceAllUsesWith(ConstantExpr::getIntegerCast(

- KnownSize, UMin->getType(), false));

- } else {

- UMin->replaceAllUsesWith(ZextGroupSize);

+ using namespace llvm::PatternMatch;

+ auto GroupIDIntrin =

+ I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()

+ : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()

+ : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

+ for (User *U : GroupSize->users()) {

+ auto *ZextGroupSize = dyn_cast<ZExtInst>(U);

+ if (!ZextGroupSize)

+ continue;

+ for (User *UMin : ZextGroupSize->users()) {

+ if (match(UMin,

+ m_UMin(m_Sub(m_Specific(GridSize),

+ m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),

+ m_Specific(ZextGroupSize)))) {

+ if (HasReqdWorkGroupSize) {

+ ConstantInt *KnownSize

+ = mdconst::extract<ConstantInt>(MD->getOperand(I));

+ UMin->replaceAllUsesWith(ConstantExpr::getIntegerCast(

+ KnownSize, UMin->getType(), false));

+ } else {

+ UMin->replaceAllUsesWith(ZextGroupSize);

+ }

+ MadeChange = true;

}

- MadeChange = true;

}

+ // If reqd_work_group_size is set, we can replace work group size with it.

if (!HasReqdWorkGroupSize)

return MadeChange;

- // Eliminate any other loads we can from the dispatch packet.

- for (int I = 0; I < 3; ++I) {

- Value *GroupSize = WorkGroupSizes[I];

+ for (int I = 0; I < 3; I++) {

+ Value *GroupSize = GroupSizes[I];

if (!GroupSize)

continue;

ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));

GroupSize->replaceAllUsesWith(

- ConstantExpr::getIntegerCast(KnownSize,

- GroupSize->getType(),

- false));

+ ConstantExpr::getIntegerCast(KnownSize, GroupSize->getType(), false));

MadeChange = true;

}

return MadeChange;

}

// TODO: Move makeLIDRangeMetadata usage into here. Seem to not get

// TargetPassConfig for subtarget.

bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {

- StringRef DispatchPtrName

- = Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr);

+ bool MadeChange = false;

+ bool IsV5OrAbove = AMDGPU::getAmdhsaCodeObjectVersion() >= 5;

+ Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);

- Function *DispatchPtr = M.getFunction(DispatchPtrName);

- if (!DispatchPtr) // Dispatch ptr not used.

+ if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.

return false;

- bool MadeChange = false;

SmallPtrSet<Instruction *, 4> HandledUses;

- for (auto *U : DispatchPtr->users()) {

+ for (auto *U : BasePtr->users()) {

CallInst *CI = cast<CallInst>(U);

if (HandledUses.insert(CI).second) {

- if (processUse(CI))

+ if (processUse(CI, IsV5OrAbove))

MadeChange = true;

}

@@ -238,6 +340,7 @@ bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {

return MadeChange;

}

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,

"AMDGPU Kernel Attributes", false, false)

INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,

@@ -251,17 +354,16 @@ ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {

PreservedAnalyses

AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {

- StringRef DispatchPtrName =

- Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr);

+ bool IsV5OrAbove = AMDGPU::getAmdhsaCodeObjectVersion() >= 5;

+ Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);

- Function *DispatchPtr = F.getParent()->getFunction(DispatchPtrName);

- if (!DispatchPtr) // Dispatch ptr not used.

+ if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.

return PreservedAnalyses::all();

for (Instruction &I : instructions(F)) {

if (CallInst *CI = dyn_cast<CallInst>(&I)) {

- if (CI->getCalledFunction() == DispatchPtr)

- processUse(CI);

+ if (CI->getCalledFunction() == BasePtr)

+ processUse(CI, IsV5OrAbove);

}