aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
diff options
context:
space:
mode:
authorDimitry Andric <dim@FreeBSD.org>2021-12-25 22:30:44 +0000
committerDimitry Andric <dim@FreeBSD.org>2021-12-25 22:30:44 +0000
commit77fc4c146f0870ffb09c1afb823ccbe742c5e6ff (patch)
tree5c0eb39553003b9c75a901af6bc4ddabd6f2f28c /llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
parentf65dcba83ce5035ab88a85fe17628b447eb56e1b (diff)
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp116
1 files changed, 67 insertions, 49 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 12d6d35a6917..6e2b5dc471bc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -24,13 +24,6 @@
// A possible future refinement is to specialise the structure per-kernel, so
// that fields can be elided based on more expensive analysis.
//
-// NOTE: Since this pass will directly pack LDS (assume large LDS) into a struct
-// type which would cause allocating huge memory for struct instance within
-// every kernel. Hence, before running this pass, it is advisable to run the
-// pass "amdgpu-replace-lds-use-with-pointer" which will replace LDS uses within
-// non-kernel functions by pointers and thereby minimizes the unnecessary per
-// kernel allocation of LDS memory.
-//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
@@ -62,6 +55,20 @@ static cl::opt<bool> SuperAlignLDSGlobals(
namespace {
+SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {
+ SmallPtrSet<GlobalValue *, 32> UsedList;
+
+ SmallVector<GlobalValue *, 32> TmpVec;
+ collectUsedGlobalVariables(M, TmpVec, true);
+ UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+ TmpVec.clear();
+ collectUsedGlobalVariables(M, TmpVec, false);
+ UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+ return UsedList;
+}
+
class AMDGPULowerModuleLDS : public ModulePass {
static void removeFromUsedList(Module &M, StringRef Name,
@@ -105,11 +112,9 @@ class AMDGPULowerModuleLDS : public ModulePass {
removeFromUsedLists(Module &M,
const std::vector<GlobalVariable *> &LocalVars) {
SmallPtrSet<Constant *, 32> LocalVarsSet;
- for (size_t I = 0; I < LocalVars.size(); I++) {
- if (Constant *C = dyn_cast<Constant>(LocalVars[I]->stripPointerCasts())) {
+ for (GlobalVariable *LocalVar : LocalVars)
+ if (Constant *C = dyn_cast<Constant>(LocalVar->stripPointerCasts()))
LocalVarsSet.insert(C);
- }
- }
removeFromUsedList(M, "llvm.used", LocalVarsSet);
removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet);
}
@@ -158,9 +163,9 @@ public:
}
bool runOnModule(Module &M) override {
- UsedList = AMDGPU::getUsedList(M);
-
- bool Changed = processUsedLDS(M);
+ UsedList = getUsedList(M);
+ bool Changed = superAlignLDSGlobals(M);
+ Changed |= processUsedLDS(M);
for (Function &F : M.functions()) {
if (F.isDeclaration())
@@ -177,6 +182,50 @@ public:
}
private:
+ // Increase the alignment of LDS globals if necessary to maximise the chance
+ // that we can use aligned LDS instructions to access them.
+ static bool superAlignLDSGlobals(Module &M) {
+ const DataLayout &DL = M.getDataLayout();
+ bool Changed = false;
+ if (!SuperAlignLDSGlobals) {
+ return Changed;
+ }
+
+ for (auto &GV : M.globals()) {
+ if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
+ // Only changing alignment of LDS variables
+ continue;
+ }
+ if (!GV.hasInitializer()) {
+ // cuda/hip extern __shared__ variable, leave alignment alone
+ continue;
+ }
+
+ Align Alignment = AMDGPU::getAlign(DL, &GV);
+ TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType());
+
+ if (GVSize > 8) {
+ // We might want to use a b96 or b128 load/store
+ Alignment = std::max(Alignment, Align(16));
+ } else if (GVSize > 4) {
+ // We might want to use a b64 load/store
+ Alignment = std::max(Alignment, Align(8));
+ } else if (GVSize > 2) {
+ // We might want to use a b32 load/store
+ Alignment = std::max(Alignment, Align(4));
+ } else if (GVSize > 1) {
+ // We might want to use a b16 load/store
+ Alignment = std::max(Alignment, Align(2));
+ }
+
+ if (Alignment != AMDGPU::getAlign(DL, &GV)) {
+ Changed = true;
+ GV.setAlignment(Alignment);
+ }
+ }
+ return Changed;
+ }
+
bool processUsedLDS(Module &M, Function *F = nullptr) {
LLVMContext &Ctx = M.getContext();
const DataLayout &DL = M.getDataLayout();
@@ -190,31 +239,6 @@ private:
return false;
}
- // Increase the alignment of LDS globals if necessary to maximise the chance
- // that we can use aligned LDS instructions to access them.
- if (SuperAlignLDSGlobals) {
- for (auto *GV : FoundLocalVars) {
- Align Alignment = AMDGPU::getAlign(DL, GV);
- TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType());
-
- if (GVSize > 8) {
- // We might want to use a b96 or b128 load/store
- Alignment = std::max(Alignment, Align(16));
- } else if (GVSize > 4) {
- // We might want to use a b64 load/store
- Alignment = std::max(Alignment, Align(8));
- } else if (GVSize > 2) {
- // We might want to use a b32 load/store
- Alignment = std::max(Alignment, Align(4));
- } else if (GVSize > 1) {
- // We might want to use a b16 load/store
- Alignment = std::max(Alignment, Align(2));
- }
-
- GV->setAlignment(Alignment);
- }
- }
-
SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
LayoutFields.reserve(FoundLocalVars.size());
for (GlobalVariable *GV : FoundLocalVars) {
@@ -343,20 +367,14 @@ private:
refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
}
- // Mark kernels with asm that reads the address of the allocated structure
- // This is not necessary for lowering. This lets other passes, specifically
- // PromoteAlloca, accurately calculate how much LDS will be used by the
- // kernel after lowering.
+ // This ensures the variable is allocated when called functions access it.
+ // It also lets other passes, specifically PromoteAlloca, accurately
+ // calculate how much LDS will be used by the kernel after lowering.
if (!F) {
IRBuilder<> Builder(Ctx);
- SmallPtrSet<Function *, 32> Kernels;
for (Function &Func : M.functions()) {
- if (Func.isDeclaration())
- continue;
-
- if (AMDGPU::isKernelCC(&Func) && !Kernels.contains(&Func)) {
+ if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) {
markUsedByKernel(Builder, &Func, SGV);
- Kernels.insert(&Func);
}
}
}