diff options
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 212 |
1 files changed, 136 insertions, 76 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index b4a8766d682e..56a9a30bc59a 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -29,6 +29,8 @@ #include "AMDGPU.h" #include "Utils/AMDGPUBaseInfo.h" #include "Utils/AMDGPUMemoryUtils.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/IR/Constants.h" @@ -43,6 +45,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/OptimizedStructLayout.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include <tuple> #include <vector> #define DEBUG_TYPE "amdgpu-lower-module-lds" @@ -97,6 +100,9 @@ class AMDGPULowerModuleLDS : public ModulePass { static void removeFromUsedLists(Module &M, const std::vector<GlobalVariable *> &LocalVars) { + // The verifier rejects used lists containing an inttoptr of a constant + // so remove the variables from these lists before replaceAllUsesWith + SmallPtrSet<Constant *, 32> LocalVarsSet; for (GlobalVariable *LocalVar : LocalVars) if (Constant *C = dyn_cast<Constant>(LocalVar->stripPointerCasts())) @@ -146,12 +152,59 @@ public: } bool runOnModule(Module &M) override { + LLVMContext &Ctx = M.getContext(); CallGraph CG = CallGraph(M); bool Changed = superAlignLDSGlobals(M); + + // Move variables used by functions into amdgcn.module.lds std::vector<GlobalVariable *> ModuleScopeVariables = AMDGPU::findVariablesToLower(M, nullptr); - Changed |= processUsedLDS(CG, M, ModuleScopeVariables); + if (!ModuleScopeVariables.empty()) { + std::string VarName = "llvm.amdgcn.module.lds"; + + GlobalVariable *SGV; + DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP; + std::tie(SGV, LDSVarsToConstantGEP) = + createLDSVariableReplacement(M, VarName, ModuleScopeVariables); + + appendToCompilerUsed( + M, {static_cast<GlobalValue *>( + ConstantExpr::getPointerBitCastOrAddrSpaceCast( + cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))}); + + removeFromUsedLists(M, ModuleScopeVariables); + replaceLDSVariablesWithStruct(M, ModuleScopeVariables, SGV, + LDSVarsToConstantGEP, + [](Use &) { return true; }); + + // This ensures the variable is allocated when called functions access it. + // It also lets other passes, specifically PromoteAlloca, accurately + // calculate how much LDS will be used by the kernel after lowering. + IRBuilder<> Builder(Ctx); + for (Function &Func : M.functions()) { + if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) { + const CallGraphNode *N = CG[&Func]; + const bool CalleesRequireModuleLDS = N->size() > 0; + + if (CalleesRequireModuleLDS) { + // If a function this kernel might call requires module LDS, + // annotate the kernel to let later passes know it will allocate + // this structure, even if not apparent from the IR. + markUsedByKernel(Builder, &Func, SGV); + } else { + // However if we are certain this kernel cannot call a function that + // requires module LDS, annotate the kernel so the backend can elide + // the allocation without repeating callgraph walks. + Func.addFnAttr("amdgpu-elide-module-lds"); + } + } + } + + Changed = true; + } + + // Move variables used by kernels into per-kernel instances for (Function &F : M.functions()) { if (F.isDeclaration()) continue; @@ -159,9 +212,37 @@ public: // Only lower compute kernels' LDS. if (!AMDGPU::isKernel(F.getCallingConv())) continue; + std::vector<GlobalVariable *> KernelUsedVariables = AMDGPU::findVariablesToLower(M, &F); - Changed |= processUsedLDS(CG, M, KernelUsedVariables, &F); + + // Replace all constant uses with instructions if they belong to the + // current kernel. Unnecessary, removing will cause test churn. + for (size_t I = 0; I < KernelUsedVariables.size(); I++) { + GlobalVariable *GV = KernelUsedVariables[I]; + for (User *U : make_early_inc_range(GV->users())) { + if (ConstantExpr *C = dyn_cast<ConstantExpr>(U)) + AMDGPU::replaceConstantUsesInFunction(C, &F); + } + GV->removeDeadConstantUsers(); + } + + if (!KernelUsedVariables.empty()) { + std::string VarName = + (Twine("llvm.amdgcn.kernel.") + F.getName() + ".lds").str(); + GlobalVariable *SGV; + DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP; + std::tie(SGV, LDSVarsToConstantGEP) = + createLDSVariableReplacement(M, VarName, KernelUsedVariables); + + removeFromUsedLists(M, KernelUsedVariables); + replaceLDSVariablesWithStruct( + M, KernelUsedVariables, SGV, LDSVarsToConstantGEP, [&F](Use &U) { + Instruction *I = dyn_cast<Instruction>(U.getUser()); + return I && I->getFunction() == &F; + }); + Changed = true; + } } return Changed; @@ -212,16 +293,18 @@ private: return Changed; } - bool processUsedLDS(CallGraph const &CG, Module &M, - std::vector<GlobalVariable *> const &LDSVarsToTransform, - Function *F = nullptr) { + std::tuple<GlobalVariable *, DenseMap<GlobalVariable *, Constant *>> + createLDSVariableReplacement( + Module &M, std::string VarName, + std::vector<GlobalVariable *> const &LDSVarsToTransform) { + // Create a struct instance containing LDSVarsToTransform and map from those + // variables to ConstantExprGEP + // Variables may be introduced to meet alignment requirements. No aliasing + // metadata is useful for these as they have no uses. Erased before return. + LLVMContext &Ctx = M.getContext(); const DataLayout &DL = M.getDataLayout(); - - if (LDSVarsToTransform.empty()) { - // No variables to rewrite, no changes made. - return false; - } + assert(!LDSVarsToTransform.empty()); SmallVector<OptimizedStructLayoutField, 8> LayoutFields; LayoutFields.reserve(LDSVarsToTransform.size()); @@ -234,9 +317,10 @@ private: performOptimizedStructLayout(LayoutFields); std::vector<GlobalVariable *> LocalVars; + BitVector IsPaddingField; LocalVars.reserve(LDSVarsToTransform.size()); // will be at least this large + IsPaddingField.reserve(LDSVarsToTransform.size()); { - // This usually won't need to insert any padding, perhaps avoid the alloc uint64_t CurrentOffset = 0; for (size_t I = 0; I < LayoutFields.size(); I++) { GlobalVariable *FGV = static_cast<GlobalVariable *>( @@ -256,10 +340,12 @@ private: M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy), "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false)); + IsPaddingField.push_back(true); CurrentOffset += Padding; } LocalVars.push_back(FGV); + IsPaddingField.push_back(false); CurrentOffset += LayoutFields[I].Size; } } @@ -270,9 +356,6 @@ private: LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes), [](const GlobalVariable *V) -> Type * { return V->getValueType(); }); - std::string VarName( - F ? (Twine("llvm.amdgcn.kernel.") + F->getName() + ".lds").str() - : "llvm.amdgcn.module.lds"); StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t"); Align StructAlign = @@ -283,62 +366,65 @@ private: VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false); SGV->setAlignment(StructAlign); - if (!F) { - appendToCompilerUsed( - M, {static_cast<GlobalValue *>( - ConstantExpr::getPointerBitCastOrAddrSpaceCast( - cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))}); + + DenseMap<GlobalVariable *, Constant *> Map; + Type *I32 = Type::getInt32Ty(Ctx); + for (size_t I = 0; I < LocalVars.size(); I++) { + GlobalVariable *GV = LocalVars[I]; + Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)}; + Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx, true); + if (IsPaddingField[I]) { + assert(GV->use_empty()); + GV->eraseFromParent(); + } else { + Map[GV] = GEP; + } } + assert(Map.size() == LDSVarsToTransform.size()); + return {SGV, std::move(Map)}; + } - // The verifier rejects used lists containing an inttoptr of a constant - // so remove the variables from these lists before replaceAllUsesWith - removeFromUsedLists(M, LocalVars); + template <typename PredicateTy> + void replaceLDSVariablesWithStruct( + Module &M, std::vector<GlobalVariable *> const &LDSVarsToTransform, + GlobalVariable *SGV, + DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP, + PredicateTy Predicate) { + LLVMContext &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); // Create alias.scope and their lists. Each field in the new structure // does not alias with all other fields. SmallVector<MDNode *> AliasScopes; SmallVector<Metadata *> NoAliasList; - if (LocalVars.size() > 1) { + const size_t NumberVars = LDSVarsToTransform.size(); + if (NumberVars > 1) { MDBuilder MDB(Ctx); - AliasScopes.reserve(LocalVars.size()); + AliasScopes.reserve(NumberVars); MDNode *Domain = MDB.createAnonymousAliasScopeDomain(); - for (size_t I = 0; I < LocalVars.size(); I++) { + for (size_t I = 0; I < NumberVars; I++) { MDNode *Scope = MDB.createAnonymousAliasScope(Domain); AliasScopes.push_back(Scope); } NoAliasList.append(&AliasScopes[1], AliasScopes.end()); } - // Replace uses of ith variable with a constantexpr to the ith field of the - // instance that will be allocated by AMDGPUMachineFunction - Type *I32 = Type::getInt32Ty(Ctx); - for (size_t I = 0; I < LocalVars.size(); I++) { - GlobalVariable *GV = LocalVars[I]; - Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)}; - Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx); - if (F) { - // Replace all constant uses with instructions if they belong to the - // current kernel. - for (User *U : make_early_inc_range(GV->users())) { - if (ConstantExpr *C = dyn_cast<ConstantExpr>(U)) - AMDGPU::replaceConstantUsesInFunction(C, F); - } - - GV->removeDeadConstantUsers(); + // Replace uses of ith variable with a constantexpr to the corresponding + // field of the instance that will be allocated by AMDGPUMachineFunction + for (size_t I = 0; I < NumberVars; I++) { + GlobalVariable *GV = LDSVarsToTransform[I]; + Constant *GEP = LDSVarsToConstantGEP[GV]; - GV->replaceUsesWithIf(GEP, [F](Use &U) { - Instruction *I = dyn_cast<Instruction>(U.getUser()); - return I && I->getFunction() == F; - }); - } else { - GV->replaceAllUsesWith(GEP); - } + GV->replaceUsesWithIf(GEP, Predicate); if (GV->use_empty()) { GV->eraseFromParent(); } - uint64_t Off = DL.getStructLayout(LDSTy)->getElementOffset(I); - Align A = commonAlignment(StructAlign, Off); + APInt APOff(DL.getIndexTypeSizeInBits(GEP->getType()), 0); + GEP->stripAndAccumulateInBoundsConstantOffsets(DL, APOff); + uint64_t Offset = APOff.getZExtValue(); + + Align A = commonAlignment(SGV->getAlign().valueOrOne(), Offset); if (I) NoAliasList[I - 1] = AliasScopes[I - 1]; @@ -349,32 +435,6 @@ private: refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias); } - - // This ensures the variable is allocated when called functions access it. - // It also lets other passes, specifically PromoteAlloca, accurately - // calculate how much LDS will be used by the kernel after lowering. - if (!F) { - IRBuilder<> Builder(Ctx); - for (Function &Func : M.functions()) { - if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) { - const CallGraphNode *N = CG[&Func]; - const bool CalleesRequireModuleLDS = N->size() > 0; - - if (CalleesRequireModuleLDS) { - // If a function this kernel might call requires module LDS, - // annotate the kernel to let later passes know it will allocate - // this structure, even if not apparent from the IR. - markUsedByKernel(Builder, &Func, SGV); - } else { - // However if we are certain this kernel cannot call a function that - // requires module LDS, annotate the kernel so the backend can elide - // the allocation without repeating callgraph walks. - Func.addFnAttr("amdgpu-elide-module-lds"); - } - } - } - } - return true; } void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL, |