diff options
| author | Dimitry Andric <dim@FreeBSD.org> | 2021-12-25 22:30:44 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2021-12-25 22:30:44 +0000 |
| commit | 77fc4c146f0870ffb09c1afb823ccbe742c5e6ff (patch) | |
| tree | 5c0eb39553003b9c75a901af6bc4ddabd6f2f28c /llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | |
| parent | f65dcba83ce5035ab88a85fe17628b447eb56e1b (diff) | |
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 116 |
1 file changed, 67 insertions, 49 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 12d6d35a6917..6e2b5dc471bc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -24,13 +24,6 @@ // A possible future refinement is to specialise the structure per-kernel, so // that fields can be elided based on more expensive analysis. // -// NOTE: Since this pass will directly pack LDS (assume large LDS) into a struct -// type which would cause allocating huge memory for struct instance within -// every kernel. Hence, before running this pass, it is advisable to run the -// pass "amdgpu-replace-lds-use-with-pointer" which will replace LDS uses within -// non-kernel functions by pointers and thereby minimizes the unnecessary per -// kernel allocation of LDS memory. -// //===----------------------------------------------------------------------===// #include "AMDGPU.h" @@ -62,6 +55,20 @@ static cl::opt<bool> SuperAlignLDSGlobals( namespace { +SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) { + SmallPtrSet<GlobalValue *, 32> UsedList; + + SmallVector<GlobalValue *, 32> TmpVec; + collectUsedGlobalVariables(M, TmpVec, true); + UsedList.insert(TmpVec.begin(), TmpVec.end()); + + TmpVec.clear(); + collectUsedGlobalVariables(M, TmpVec, false); + UsedList.insert(TmpVec.begin(), TmpVec.end()); + + return UsedList; +} + class AMDGPULowerModuleLDS : public ModulePass { static void removeFromUsedList(Module &M, StringRef Name, @@ -105,11 +112,9 @@ class AMDGPULowerModuleLDS : public ModulePass { removeFromUsedLists(Module &M, const std::vector<GlobalVariable *> &LocalVars) { SmallPtrSet<Constant *, 32> LocalVarsSet; - for (size_t I = 0; I < LocalVars.size(); I++) { - if (Constant *C = dyn_cast<Constant>(LocalVars[I]->stripPointerCasts())) { + for (GlobalVariable *LocalVar : LocalVars) + if (Constant *C = dyn_cast<Constant>(LocalVar->stripPointerCasts())) 
LocalVarsSet.insert(C); - } - } removeFromUsedList(M, "llvm.used", LocalVarsSet); removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet); } @@ -158,9 +163,9 @@ public: } bool runOnModule(Module &M) override { - UsedList = AMDGPU::getUsedList(M); - - bool Changed = processUsedLDS(M); + UsedList = getUsedList(M); + bool Changed = superAlignLDSGlobals(M); + Changed |= processUsedLDS(M); for (Function &F : M.functions()) { if (F.isDeclaration()) @@ -177,6 +182,50 @@ public: } private: + // Increase the alignment of LDS globals if necessary to maximise the chance + // that we can use aligned LDS instructions to access them. + static bool superAlignLDSGlobals(Module &M) { + const DataLayout &DL = M.getDataLayout(); + bool Changed = false; + if (!SuperAlignLDSGlobals) { + return Changed; + } + + for (auto &GV : M.globals()) { + if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) { + // Only changing alignment of LDS variables + continue; + } + if (!GV.hasInitializer()) { + // cuda/hip extern __shared__ variable, leave alignment alone + continue; + } + + Align Alignment = AMDGPU::getAlign(DL, &GV); + TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType()); + + if (GVSize > 8) { + // We might want to use a b96 or b128 load/store + Alignment = std::max(Alignment, Align(16)); + } else if (GVSize > 4) { + // We might want to use a b64 load/store + Alignment = std::max(Alignment, Align(8)); + } else if (GVSize > 2) { + // We might want to use a b32 load/store + Alignment = std::max(Alignment, Align(4)); + } else if (GVSize > 1) { + // We might want to use a b16 load/store + Alignment = std::max(Alignment, Align(2)); + } + + if (Alignment != AMDGPU::getAlign(DL, &GV)) { + Changed = true; + GV.setAlignment(Alignment); + } + } + return Changed; + } + bool processUsedLDS(Module &M, Function *F = nullptr) { LLVMContext &Ctx = M.getContext(); const DataLayout &DL = M.getDataLayout(); @@ -190,31 +239,6 @@ private: return false; } - // Increase the alignment of 
LDS globals if necessary to maximise the chance - // that we can use aligned LDS instructions to access them. - if (SuperAlignLDSGlobals) { - for (auto *GV : FoundLocalVars) { - Align Alignment = AMDGPU::getAlign(DL, GV); - TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType()); - - if (GVSize > 8) { - // We might want to use a b96 or b128 load/store - Alignment = std::max(Alignment, Align(16)); - } else if (GVSize > 4) { - // We might want to use a b64 load/store - Alignment = std::max(Alignment, Align(8)); - } else if (GVSize > 2) { - // We might want to use a b32 load/store - Alignment = std::max(Alignment, Align(4)); - } else if (GVSize > 1) { - // We might want to use a b16 load/store - Alignment = std::max(Alignment, Align(2)); - } - - GV->setAlignment(Alignment); - } - } - SmallVector<OptimizedStructLayoutField, 8> LayoutFields; LayoutFields.reserve(FoundLocalVars.size()); for (GlobalVariable *GV : FoundLocalVars) { @@ -343,20 +367,14 @@ private: refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias); } - // Mark kernels with asm that reads the address of the allocated structure - // This is not necessary for lowering. This lets other passes, specifically - // PromoteAlloca, accurately calculate how much LDS will be used by the - // kernel after lowering. + // This ensures the variable is allocated when called functions access it. + // It also lets other passes, specifically PromoteAlloca, accurately + // calculate how much LDS will be used by the kernel after lowering. if (!F) { IRBuilder<> Builder(Ctx); - SmallPtrSet<Function *, 32> Kernels; for (Function &Func : M.functions()) { - if (Func.isDeclaration()) - continue; - - if (AMDGPU::isKernelCC(&Func) && !Kernels.contains(&Func)) { + if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) { markUsedByKernel(Builder, &Func, SGV); - Kernels.insert(&Func); } } } |
