Vendor import of llvm-project main llvmorg-14-init-13186-g0c553cc1af2e. - src

diff options


context:
space:
mode:

author	Dimitry Andric <dim@FreeBSD.org>	2021-12-25 22:30:44 +0000
committer	Dimitry Andric <dim@FreeBSD.org>	2021-12-25 22:30:44 +0000
commit	77fc4c146f0870ffb09c1afb823ccbe742c5e6ff (patch)
tree	5c0eb39553003b9c75a901af6bc4ddabd6f2f28c /llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
parent	f65dcba83ce5035ab88a85fe17628b447eb56e1b (diff)

vendor/llvm-project/llvmorg-14-init-13186-g0c553cc1af2e

Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp')

-rw-r--r--

llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp

116

1 files changed, 67 insertions, 49 deletions

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 12d6d35a6917..6e2b5dc471bc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp

@@ -24,13 +24,6 @@

// A possible future refinement is to specialise the structure per-kernel, so

// that fields can be elided based on more expensive analysis.

-// NOTE: Since this pass will directly pack LDS (assume large LDS) into a struct

-// type which would cause allocating huge memory for struct instance within

-// every kernel. Hence, before running this pass, it is advisable to run the

-// pass "amdgpu-replace-lds-use-with-pointer" which will replace LDS uses within

-// non-kernel functions by pointers and thereby minimizes the unnecessary per

-// kernel allocation of LDS memory.

-//

//===----------------------------------------------------------------------===//

#include "AMDGPU.h"

@@ -62,6 +55,20 @@ static cl::opt<bool> SuperAlignLDSGlobals(

namespace {

+SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {

+ SmallPtrSet<GlobalValue *, 32> UsedList;

+ SmallVector<GlobalValue *, 32> TmpVec;

+ collectUsedGlobalVariables(M, TmpVec, true);

+ UsedList.insert(TmpVec.begin(), TmpVec.end());

+ TmpVec.clear();

+ collectUsedGlobalVariables(M, TmpVec, false);

+ UsedList.insert(TmpVec.begin(), TmpVec.end());

+ return UsedList;

class AMDGPULowerModuleLDS : public ModulePass {

static void removeFromUsedList(Module &M, StringRef Name,

@@ -105,11 +112,9 @@ class AMDGPULowerModuleLDS : public ModulePass {

removeFromUsedLists(Module &M,

const std::vector<GlobalVariable *> &LocalVars) {

SmallPtrSet<Constant *, 32> LocalVarsSet;

- for (size_t I = 0; I < LocalVars.size(); I++) {

- if (Constant *C = dyn_cast<Constant>(LocalVars[I]->stripPointerCasts())) {

+ for (GlobalVariable *LocalVar : LocalVars)

+ if (Constant *C = dyn_cast<Constant>(LocalVar->stripPointerCasts()))

LocalVarsSet.insert(C);

- }

removeFromUsedList(M, "llvm.used", LocalVarsSet);

removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet);

}

@@ -158,9 +163,9 @@ public:

}

bool runOnModule(Module &M) override {

- UsedList = AMDGPU::getUsedList(M);

- bool Changed = processUsedLDS(M);

+ UsedList = getUsedList(M);

+ bool Changed = superAlignLDSGlobals(M);

+ Changed |= processUsedLDS(M);

for (Function &F : M.functions()) {

if (F.isDeclaration())

@@ -177,6 +182,50 @@ public:

}

private:

+ // Increase the alignment of LDS globals if necessary to maximise the chance

+ // that we can use aligned LDS instructions to access them.

+ static bool superAlignLDSGlobals(Module &M) {

+ const DataLayout &DL = M.getDataLayout();

+ bool Changed = false;

+ if (!SuperAlignLDSGlobals) {

+ return Changed;

+ }

+ for (auto &GV : M.globals()) {

+ if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {

+ // Only changing alignment of LDS variables

+ continue;

+ }

+ if (!GV.hasInitializer()) {

+ // cuda/hip extern __shared__ variable, leave alignment alone

+ continue;

+ }

+ Align Alignment = AMDGPU::getAlign(DL, &GV);

+ TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType());

+ if (GVSize > 8) {

+ // We might want to use a b96 or b128 load/store

+ Alignment = std::max(Alignment, Align(16));

+ } else if (GVSize > 4) {

+ // We might want to use a b64 load/store

+ Alignment = std::max(Alignment, Align(8));

+ } else if (GVSize > 2) {

+ // We might want to use a b32 load/store

+ Alignment = std::max(Alignment, Align(4));

+ } else if (GVSize > 1) {

+ // We might want to use a b16 load/store

+ Alignment = std::max(Alignment, Align(2));

+ }

+ if (Alignment != AMDGPU::getAlign(DL, &GV)) {

+ Changed = true;

+ GV.setAlignment(Alignment);

+ }

+ return Changed;

+ }

bool processUsedLDS(Module &M, Function *F = nullptr) {

LLVMContext &Ctx = M.getContext();

const DataLayout &DL = M.getDataLayout();

@@ -190,31 +239,6 @@ private:

return false;

}

- // Increase the alignment of LDS globals if necessary to maximise the chance

- // that we can use aligned LDS instructions to access them.

- if (SuperAlignLDSGlobals) {

- for (auto *GV : FoundLocalVars) {

- Align Alignment = AMDGPU::getAlign(DL, GV);

- TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType());

- if (GVSize > 8) {

- // We might want to use a b96 or b128 load/store

- Alignment = std::max(Alignment, Align(16));

- } else if (GVSize > 4) {

- // We might want to use a b64 load/store

- Alignment = std::max(Alignment, Align(8));

- } else if (GVSize > 2) {

- // We might want to use a b32 load/store

- Alignment = std::max(Alignment, Align(4));

- } else if (GVSize > 1) {

- // We might want to use a b16 load/store

- Alignment = std::max(Alignment, Align(2));

- }

- GV->setAlignment(Alignment);

- }

SmallVector<OptimizedStructLayoutField, 8> LayoutFields;

LayoutFields.reserve(FoundLocalVars.size());

for (GlobalVariable *GV : FoundLocalVars) {

@@ -343,20 +367,14 @@ private:

refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);

}

- // Mark kernels with asm that reads the address of the allocated structure

- // This is not necessary for lowering. This lets other passes, specifically

- // PromoteAlloca, accurately calculate how much LDS will be used by the

- // kernel after lowering.

+ // This ensures the variable is allocated when called functions access it.

+ // It also lets other passes, specifically PromoteAlloca, accurately

+ // calculate how much LDS will be used by the kernel after lowering.

if (!F) {

IRBuilder<> Builder(Ctx);

- SmallPtrSet<Function *, 32> Kernels;

for (Function &Func : M.functions()) {

- if (Func.isDeclaration())

- continue;

- if (AMDGPU::isKernelCC(&Func) && !Kernels.contains(&Func)) {

+ if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) {

markUsedByKernel(Builder, &Func, SGV);

- Kernels.insert(&Func);

}