| author | Dimitry Andric <dim@FreeBSD.org> | 2023-07-26 19:03:47 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2023-07-26 19:04:23 +0000 |
| commit | 7fa27ce4a07f19b07799a767fc29416f3b625afb (patch) | |
| tree | 27825c83636c4de341eb09a74f49f5d38a15d165 /llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | |
| parent | e3b557809604d036af6e00c60f012c2025b59a5e (diff) | |
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 981 |
1 file changed, 645 insertions(+), 336 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 11ba5c91dae9..e3a645977f92 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -20,9 +20,8 @@
 // This model means the GPU runtime can specify the amount of memory allocated.
 // If this is more than the kernel assumed, the excess can be made available
 // using a language specific feature, which IR represents as a variable with
-// no initializer. This feature is not yet implemented for non-kernel functions.
-// This lowering could be extended to handle that use case, but would probably
-// require closer integration with promoteAllocaToLDS.
+// no initializer. This feature is referred to here as "Dynamic LDS" and is
+// lowered slightly differently to the normal case.
 //
 // Consequences of this GPU feature:
 // - memory is limited and exceeding it halts compilation
@@ -65,17 +64,15 @@
 // Kernel | Yes | Yes     | No  |
 // Hybrid | Yes | Partial | Yes |
 //
-// Module spends LDS memory to save cycles. Table spends cycles and global
-// memory to save LDS. Kernel is as fast as kernel allocation but only works
-// for variables that are known reachable from a single kernel. Hybrid picks
-// between all three. When forced to choose between LDS and cycles it minimises
+// "Module" spends LDS memory to save cycles. "Table" spends cycles and global
+// memory to save LDS. "Kernel" is as fast as kernel allocation but only works
+// for variables that are known reachable from a single kernel. "Hybrid" picks
+// between all three. When forced to choose between LDS and cycles we minimise
 // LDS use.
 // The "module" lowering implemented here finds LDS variables which are used by
 // non-kernel functions and creates a new struct with a field for each of those
 // LDS variables. Variables that are only used from kernels are excluded.
-// Kernels that do not use this struct are annotated with the attribute
-// amdgpu-elide-module-lds which allows the back end to elide the allocation.
 //
 // The "table" lowering implemented here has three components.
 // First kernels are assigned a unique integer identifier which is available in
@@ -115,6 +112,68 @@
 // use LDS are expected to hit the "Kernel" lowering strategy
 // - The runtime properties impose a cost in compiler implementation complexity
 //
+// Dynamic LDS implementation
+// Dynamic LDS is lowered similarly to the "table" strategy above and uses the
+// same intrinsic to identify which kernel is at the root of the dynamic call
+// graph. This relies on the specified behaviour that all dynamic LDS variables
+// alias one another, i.e. are at the same address, with respect to a given
+// kernel. Therefore this pass creates new dynamic LDS variables for each kernel
+// that allocates any dynamic LDS and builds a table of addresses out of those.
+// The AMDGPUPromoteAlloca pass skips kernels that use dynamic LDS.
+// The corresponding optimisation for "kernel" lowering where the table lookup
+// is elided is not implemented.
+//
+//
+// Implementation notes / limitations
+// A single LDS global variable represents an instance per kernel that can
+// reach it. This pass essentially specialises those variables per kernel.
+// Handling ConstantExpr during the pass complicated this significantly, so now
+// all ConstantExpr uses of LDS variables are expanded to instructions. This
+// may need amending when implementing non-undef initialisers.
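Editorial note: for readers unfamiliar with the "Dynamic LDS" shape the new header comment describes, here is a minimal IR sketch of such input before this pass runs (variable and kernel names are hypothetical):

```llvm
; Dynamic LDS: an LDS (addrspace(3)) variable with no initializer and a
; zero-sized array type; the runtime chooses the actual allocation size.
@dyn.shared = external addrspace(3) global [0 x i8], align 4

define amdgpu_kernel void @k() {
  ; Every dynamic LDS variable reachable from a given kernel aliases the
  ; others, i.e. they all start at the same per-kernel address.
  store i8 0, ptr addrspace(3) @dyn.shared
  ret void
}
```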
+//
+// Lowering is split between this IR pass and the back end. This pass chooses
+// where given variables should be allocated and marks them with metadata,
+// MD_absolute_symbol. The backend places the variables in coincidentally the
+// same location and raises a fatal error if something has gone awry. This works
+// in practice because the only pass between this one and the backend that
+// changes LDS is PromoteAlloca and the changes it makes do not conflict.
+//
+// Addresses are written to constant global arrays based on the same metadata.
+//
+// The backend lowers LDS variables in the order of traversal of the function.
+// This is at odds with the deterministic layout required. The workaround is to
+// allocate the fixed-address variables immediately upon starting the function
+// where they can be placed as intended. This requires a means of mapping from
+// the function to the variables that it allocates. For the module scope LDS,
+// this is via metadata indicating whether the variable is not required. If a
+// pass deletes that metadata, a fatal error on disagreement with the absolute
+// symbol metadata will occur. For kernel scope and dynamic, this is by _name_
+// correspondence between the function and the variable. It requires the kernel
+// to have a name (which is only a limitation for tests in practice) and for
+// nothing to rename the corresponding symbols. This is a hazard if the pass is
+// run multiple times during debugging. Alternative schemes considered all
+// involve bespoke metadata.
+//
+// If the name correspondence can be replaced, multiple distinct kernels that
+// have the same memory layout can map to the same kernel id (as the address
+// itself is handled by the absolute symbol metadata) and that will allow more
+// uses of the "kernel" style faster lowering and reduce the size of the lookup
+// tables.
+//
+// There is a test that checks this does not fire for a graphics shader. This
+// lowering is expected to work for graphics if the isKernel test is changed.
+//
+// The current markUsedByKernel is sufficient for PromoteAlloca but is elided
+// before codegen. Replacing this with an equivalent intrinsic which lasts until
+// shortly after the machine function lowering of LDS would help break the name
+// mapping. The other part needed is probably to amend PromoteAlloca to embed
+// the LDS variables it creates in the same struct created here. That avoids the
+// current hazard where a PromoteAlloca LDS variable might be allocated before
+// the kernel scope (and thus error on the address check). Given a new invariant
+// that no LDS variables exist outside of the structs managed here, and an
+// intrinsic that lasts until after the LDS frame lowering, it should be
+// possible to drop the name mapping and fold equivalent memory layouts.
+//
 //===----------------------------------------------------------------------===//
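Editorial note: the MD_absolute_symbol annotation mentioned above encodes a half-open [lo, hi) address range. A sketch of what the pass attaches (variable name hypothetical):

```llvm
; The half-open range [0, 1) pins this variable to LDS address 0 exactly;
; the backend raises a fatal error if its own allocation disagrees.
@lds.pinned = internal addrspace(3) global i32 poison, align 4, !absolute_symbol !0

!0 = !{i32 0, i32 1}
```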
 
 #include "AMDGPU.h"
@@ -134,11 +193,14 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/ReplaceConstant.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/OptimizedStructLayout.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -162,7 +224,7 @@ enum class LoweringKind { module, table, kernel, hybrid };
 cl::opt<LoweringKind> LoweringKindLoc(
     "amdgpu-lower-module-lds-strategy",
     cl::desc("Specify lowering strategy for function LDS access:"), cl::Hidden,
-    cl::init(LoweringKind::module),
+    cl::init(LoweringKind::hybrid),
     cl::values(
         clEnumValN(LoweringKind::table, "table", "Lower via table lookup"),
         clEnumValN(LoweringKind::module, "module", "Lower via module struct"),
@@ -183,6 +245,13 @@ bool isKernelLDS(const Function *F) {
   return AMDGPU::isKernel(F->getCallingConv());
 }
 
+template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
+  llvm::sort(V.begin(), V.end(), [](const auto *L, const auto *R) {
+    return L->getName() < R->getName();
+  });
+  return {std::move(V)};
+}
+
 class AMDGPULowerModuleLDS : public ModulePass {
 
   static void
@@ -201,8 +270,7 @@ class AMDGPULowerModuleLDS : public ModulePass {
     LocalVar->removeDeadConstantUsers();
   }
 
-  static void markUsedByKernel(IRBuilder<> &Builder, Function *Func,
-                               GlobalVariable *SGV) {
+  static void markUsedByKernel(Function *Func, GlobalVariable *SGV) {
     // The llvm.amdgcn.module.lds instance is implicitly used by all kernels
     // that might call a function which accesses a field within it. This is
     // presently approximated to 'all kernels' if there are any such functions
     // in the module.
@@ -217,21 +285,22 @@
     // llvm.donothing that takes a pointer to the instance and is lowered to a
     // no-op after LDS is allocated, but that is not presently necessary.
 
-    LLVMContext &Ctx = Func->getContext();
-
-    Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI());
-
-    FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), {});
+    // This intrinsic is eliminated shortly before instruction selection. It
+    // does not suffice to indicate to ISel that a given global which is not
+    // immediately used by the kernel must still be allocated by it. An
+    // equivalent target specific intrinsic which lasts until immediately after
+    // codegen would suffice for that, but one would still need to ensure that
+    // the variables are allocated in the anticipated order.
+    IRBuilder<> Builder(Func->getEntryBlock().getFirstNonPHI());
 
     Function *Decl =
         Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {});
 
-    Value *UseInstance[1] = {Builder.CreateInBoundsGEP(
-        SGV->getValueType(), SGV, ConstantInt::get(Type::getInt32Ty(Ctx), 0))};
+    Value *UseInstance[1] = {
+        Builder.CreateConstInBoundsGEP1_32(SGV->getValueType(), SGV, 0)};
 
-    Builder.CreateCall(FTy, Decl, {},
-                       {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)},
-                       "");
+    Builder.CreateCall(
+        Decl, {}, {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)});
   }
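Editorial note: a sketch of the IR markUsedByKernel emits at kernel entry (struct type and names hypothetical); the operand bundle keeps the LDS instance alive through optimisation and is dropped before instruction selection:

```llvm
%lds.t = type { i32 }

@llvm.amdgcn.module.lds = internal addrspace(3) global %lds.t poison, align 4

declare void @llvm.donothing()

define amdgpu_kernel void @k() {
entry:
  ; The "ExplicitUse" bundle anchors the LDS instance to this kernel so that
  ; earlier passes cannot discard it as unused.
  call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
  ret void
}
```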
 
   static bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
@@ -240,7 +309,7 @@
     // This pass specialises LDS variables with respect to the kernel that
     // allocates them.
 
-    // This is semantically equivalent to:
+    // This is semantically equivalent to (the following, which is
+    // unimplemented because it would be slow):
     // for (auto &F : M.functions())
     //   for (auto &BB : F)
     //     for (auto &I : BB)
     //       for (Use &Op : I.operands())
     //         if (constantExprUsesLDS(Op))
     //           replaceConstantExprInFunction(I, Op);
 
@@ -248,63 +317,12 @@
-    bool Changed = false;
-
-    // Find all ConstantExpr that are direct users of an LDS global
-    SmallVector<ConstantExpr *> Stack;
+    SmallVector<Constant *> LDSGlobals;
     for (auto &GV : M.globals())
       if (AMDGPU::isLDSVariableToLower(GV))
-        for (User *U : GV.users())
-          if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
-            Stack.push_back(C);
-
-    // Expand to include constexpr users of direct users
-    SetVector<ConstantExpr *> ConstExprUsersOfLDS;
-    while (!Stack.empty()) {
-      ConstantExpr *V = Stack.pop_back_val();
-      if (ConstExprUsersOfLDS.contains(V))
-        continue;
-
-      ConstExprUsersOfLDS.insert(V);
+        LDSGlobals.push_back(&GV);
 
-      for (auto *Nested : V->users())
-        if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Nested))
-          Stack.push_back(CE);
-    }
-
-    // Find all instructions that use any of the ConstExpr users of LDS
-    SetVector<Instruction *> InstructionWorklist;
-    for (ConstantExpr *CE : ConstExprUsersOfLDS)
-      for (User *U : CE->users())
-        if (auto *I = dyn_cast<Instruction>(U))
-          InstructionWorklist.insert(I);
-
-    // Replace those ConstExpr operands with instructions
-    while (!InstructionWorklist.empty()) {
-      Instruction *I = InstructionWorklist.pop_back_val();
-      for (Use &U : I->operands()) {
-
-        auto *BI = I;
-        if (auto *Phi = dyn_cast<PHINode>(I)) {
-          BasicBlock *BB = Phi->getIncomingBlock(U);
-          BasicBlock::iterator It = BB->getFirstInsertionPt();
-          assert(It != BB->end() && "Unexpected empty basic block");
-          BI = &(*(It));
-        }
-
-        if (ConstantExpr *C = dyn_cast<ConstantExpr>(U.get())) {
-          if (ConstExprUsersOfLDS.contains(C)) {
-            Changed = true;
-            Instruction *NI = C->getAsInstruction(BI);
-            InstructionWorklist.insert(NI);
-            U.set(NI);
-            C->removeDeadConstantUsers();
-          }
-        }
-      }
-    }
-
-    return Changed;
+    return convertUsersOfConstantsToInstructions(LDSGlobals);
   }
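Editorial note: a before/after IR sketch of the rewrite this function now delegates to convertUsersOfConstantsToInstructions (names hypothetical). Expanding ConstantExpr uses into instructions means the per-kernel rewriting later in the pass only has to handle instructions:

```llvm
@lds = internal addrspace(3) global [2 x i32] poison, align 4

; Before: the GEP is a ConstantExpr folded into the load operand.
define void @f() {
  %v = load i32, ptr addrspace(3) getelementptr inbounds ([2 x i32], ptr addrspace(3) @lds, i32 0, i32 1)
  ret void
}

; After: the ConstantExpr is rematerialised as an ordinary instruction.
define void @f.expanded() {
  %gep = getelementptr inbounds [2 x i32], ptr addrspace(3) @lds, i32 0, i32 1
  %v = load i32, ptr addrspace(3) %gep
  ret void
}
```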
 
 public:
@@ -329,7 +347,11 @@
       continue;
     }
 
-    SmallVector<User *, 16> Stack(GV.users());
+    if (GV.isAbsoluteSymbolRef()) {
+      report_fatal_error(
+          "LDS variables with absolute addresses are unimplemented.");
+    }
+
     for (User *V : GV.users()) {
       if (auto *I = dyn_cast<Instruction>(V)) {
         Function *F = I->getFunction();
@@ -358,11 +380,11 @@
   DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
   for (Function &F : M.functions()) {
     if (!isKernelLDS(&F))
-      if (F.hasAddressTaken(nullptr,
-                            /* IgnoreCallbackUses */ false,
-                            /* IgnoreAssumeLikeCalls */ false,
-                            /* IgnoreLLVMUsed */ true,
-                            /* IgnoreArcAttachedCall */ false)) {
+      if (F.hasAddressTaken(nullptr,
+                            /* IgnoreCallbackUses */ false,
+                            /* IgnoreAssumeLikeCalls */ false,
+                            /* IgnoreLLVMUsed */ true,
+                            /* IgnoreArcAttachedCall */ false)) {
         set_union(VariablesReachableThroughFunctionPointer,
                   direct_map_function[&F]);
       }
@@ -370,7 +392,7 @@
   auto functionMakesUnknownCall = [&](const Function *F) -> bool {
     assert(!F->isDeclaration());
-    for (CallGraphNode::CallRecord R : *CG[F]) {
+    for (const CallGraphNode::CallRecord &R : *CG[F]) {
       if (!R.second->getFunction()) {
         return true;
       }
@@ -408,7 +430,7 @@
         // have already been computed, with more care than this
         set_union(transitive_map_function[&Func], direct_map_function[F]);
 
-        for (CallGraphNode::CallRecord R : *CG[F]) {
+        for (const CallGraphNode::CallRecord &R : *CG[F]) {
           Function *ith = R.second->getFunction();
           if (ith) {
             if (!seen.contains(ith)) {
@@ -428,7 +450,7 @@
     if (Func.isDeclaration() || !isKernelLDS(&Func))
       continue;
 
-    for (CallGraphNode::CallRecord R : *CG[&Func]) {
+    for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
       Function *ith = R.second->getFunction();
       if (ith) {
         set_union(indirect_map_kernel[&Func], transitive_map_function[ith]);
@@ -454,7 +476,7 @@
   static Constant *getAddressesOfVariablesInKernel(
       LLVMContext &Ctx, ArrayRef<GlobalVariable *> Variables,
-      DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP) {
+      const DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP) {
     // Create a ConstantArray containing the address of each Variable within the
     // kernel corresponding to LDSVarsToConstantGEP, or poison if that kernel
     // does not allocate it
@@ -467,8 +489,9 @@
     SmallVector<Constant *> Elements;
     for (size_t i = 0; i < Variables.size(); i++) {
       GlobalVariable *GV = Variables[i];
-      if (LDSVarsToConstantGEP.count(GV) != 0) {
-        auto elt = ConstantExpr::getPtrToInt(LDSVarsToConstantGEP[GV], I32);
+      auto ConstantGepIt = LDSVarsToConstantGEP.find(GV);
+      if (ConstantGepIt != LDSVarsToConstantGEP.end()) {
+        auto elt = ConstantExpr::getPtrToInt(ConstantGepIt->second, I32);
         Elements.push_back(elt);
       } else {
         Elements.push_back(PoisonValue::get(I32));
@@ -495,11 +518,15 @@
     ArrayType *AllKernelsOffsetsType =
         ArrayType::get(KernelOffsetsType, NumberKernels);
 
+    Constant *Missing = PoisonValue::get(KernelOffsetsType);
     std::vector<Constant *> overallConstantExprElts(NumberKernels);
     for (size_t i = 0; i < NumberKernels; i++) {
-      LDSVariableReplacement Replacement = KernelToReplacement[kernels[i]];
-      overallConstantExprElts[i] = getAddressesOfVariablesInKernel(
-          Ctx, Variables, Replacement.LDSVarsToConstantGEP);
+      auto Replacement = KernelToReplacement.find(kernels[i]);
+      overallConstantExprElts[i] =
+          (Replacement == KernelToReplacement.end())
+              ? Missing
+              : getAddressesOfVariablesInKernel(
+                    Ctx, Variables, Replacement->second.LDSVarsToConstantGEP);
     }
 
     Constant *init =
@@ -511,36 +538,49 @@
         AMDGPUAS::CONSTANT_ADDRESS);
   }
 
-  void replaceUsesInInstructionsWithTableLookup(
-      Module &M, ArrayRef<GlobalVariable *> ModuleScopeVariables,
-      GlobalVariable *LookupTable) {
-
+  void replaceUseWithTableLookup(Module &M, IRBuilder<> &Builder,
+                                 GlobalVariable *LookupTable,
+                                 GlobalVariable *GV, Use &U,
+                                 Value *OptionalIndex) {
+    // Table is a constant array of the same length as OrderedKernels
     LLVMContext &Ctx = M.getContext();
-    IRBuilder<> Builder(Ctx);
     Type *I32 = Type::getInt32Ty(Ctx);
+    auto *I = cast<Instruction>(U.getUser());
 
-    // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which
-    // lowers to a read from a live in register. Emit it once in the entry
-    // block to spare deduplicating it later.
+    Value *tableKernelIndex = getTableLookupKernelIndex(M, I->getFunction());
 
-    DenseMap<Function *, Value *> tableKernelIndexCache;
-    auto getTableKernelIndex = [&](Function *F) -> Value * {
-      if (tableKernelIndexCache.count(F) == 0) {
-        LLVMContext &Ctx = M.getContext();
-        FunctionType *FTy = FunctionType::get(Type::getInt32Ty(Ctx), {});
-        Function *Decl =
-            Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {});
-
-        BasicBlock::iterator it =
-            F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
-        Instruction &i = *it;
-        Builder.SetInsertPoint(&i);
-
-        tableKernelIndexCache[F] = Builder.CreateCall(FTy, Decl, {});
-      }
-
-      return tableKernelIndexCache[F];
-    };
+    if (auto *Phi = dyn_cast<PHINode>(I)) {
+      BasicBlock *BB = Phi->getIncomingBlock(U);
+      Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt())));
+    } else {
+      Builder.SetInsertPoint(I);
+    }
+
+    SmallVector<Value *, 3> GEPIdx = {
+        ConstantInt::get(I32, 0),
+        tableKernelIndex,
+    };
+    if (OptionalIndex)
+      GEPIdx.push_back(OptionalIndex);
+
+    Value *Address = Builder.CreateInBoundsGEP(
+        LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName());
+
+    Value *loaded = Builder.CreateLoad(I32, Address);
+
+    Value *replacement =
+        Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName());
+
+    U.set(replacement);
+  }
+
+  void replaceUsesInInstructionsWithTableLookup(
+      Module &M, ArrayRef<GlobalVariable *> ModuleScopeVariables,
+      GlobalVariable *LookupTable) {
+
+    LLVMContext &Ctx = M.getContext();
+    IRBuilder<> Builder(Ctx);
+    Type *I32 = Type::getInt32Ty(Ctx);
 
     for (size_t Index = 0; Index < ModuleScopeVariables.size(); Index++) {
       auto *GV = ModuleScopeVariables[Index];
@@ -550,32 +590,8 @@
       if (!I)
         continue;
 
-      Value *tableKernelIndex = getTableKernelIndex(I->getFunction());
-
-      // So if the phi uses this value multiple times, what does this look
-      // like?
-      if (auto *Phi = dyn_cast<PHINode>(I)) {
-        BasicBlock *BB = Phi->getIncomingBlock(U);
-        Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt())));
-      } else {
-        Builder.SetInsertPoint(I);
-      }
-
-      Value *GEPIdx[3] = {
-          ConstantInt::get(I32, 0),
-          tableKernelIndex,
-          ConstantInt::get(I32, Index),
-      };
-
-      Value *Address = Builder.CreateInBoundsGEP(
-          LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName());
-
-      Value *loaded = Builder.CreateLoad(I32, Address);
-
-      Value *replacement =
-          Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName());
-
-      U.set(replacement);
+      replaceUseWithTableLookup(M, Builder, LookupTable, GV, U,
+                                ConstantInt::get(I32, Index));
     }
   }
@@ -586,7 +602,8 @@
     DenseSet<Function *> KernelSet;
 
-    if (VariableSet.empty()) return KernelSet;
+    if (VariableSet.empty())
+      return KernelSet;
 
     for (Function &Func : M.functions()) {
       if (Func.isDeclaration() || !isKernelLDS(&Func))
@@ -649,8 +666,9 @@
         // strategy
         continue;
       }
-      CandidateTy Candidate(GV, K.second.size(),
-                            DL.getTypeAllocSize(GV->getValueType()).getFixedValue());
+      CandidateTy Candidate(
+          GV, K.second.size(),
+          DL.getTypeAllocSize(GV->getValueType()).getFixedValue());
       if (MostUsed < Candidate)
         MostUsed = Candidate;
     }
@@ -658,173 +676,258 @@
     return MostUsed.GV;
   }
 
-  bool runOnModule(Module &M) override {
-    LLVMContext &Ctx = M.getContext();
-    CallGraph CG = CallGraph(M);
-    bool Changed = superAlignLDSGlobals(M);
-
-    Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
-
-    Changed = true; // todo: narrow this down
-
-    // For each kernel, what variables does it access directly or through
-    // callees
-    LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
-
-    // For each variable accessed through callees, which kernels access it
-    VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
-    for (auto &K : LDSUsesInfo.indirect_access) {
-      Function *F = K.first;
-      assert(isKernelLDS(F));
-      for (GlobalVariable *GV : K.second) {
-        LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
-      }
-    }
+  static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
+                                       uint32_t Address) {
+    // Write the specified address into metadata where it can be retrieved by
+    // the assembler. Format is a half open range, [Address Address+1)
+    LLVMContext &Ctx = M->getContext();
+    auto *IntTy =
+        M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
+    auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
+    auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
+    GV->setMetadata(LLVMContext::MD_absolute_symbol,
+                    MDNode::get(Ctx, {MinC, MaxC}));
+  }
 
+  DenseMap<Function *, Value *> tableKernelIndexCache;
+  Value *getTableLookupKernelIndex(Module &M, Function *F) {
+    // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which
+    // lowers to a read from a live in register. Emit it once in the entry
+    // block to spare deduplicating it later.
+    auto [It, Inserted] = tableKernelIndexCache.try_emplace(F);
+    if (Inserted) {
+      Function *Decl =
+          Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {});
+
+      auto InsertAt = F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
+      IRBuilder<> Builder(&*InsertAt);
+
+      It->second = Builder.CreateCall(Decl, {});
+    }
+
+    return It->second;
+  }
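Editorial note: a sketch of the IR a rewritten use lowers to (table name and shape hypothetical): the kernel id selects the row, the variable's slot supplies the column, and the loaded i32 is cast back to an LDS pointer:

```llvm
declare i32 @llvm.amdgcn.lds.kernel.id()

@table = internal addrspace(4) constant [2 x [1 x i32]] zeroinitializer

define void @use_lds_indirectly() {
  ; Emitted once in the entry block; lowers to a read of a live-in SGPR.
  %id = call i32 @llvm.amdgcn.lds.kernel.id()
  %slot = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @table, i32 0, i32 %id, i32 0
  %addr = load i32, ptr addrspace(4) %slot
  %ptr = inttoptr i32 %addr to ptr addrspace(3)
  store i32 0, ptr addrspace(3) %ptr
  ret void
}
```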
 
+  static std::vector<Function *> assignLDSKernelIDToEachKernel(
+      Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS,
+      DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS) {
+    // Associate kernels in the set with an arbitrary but reproducible order
+    // and annotate them with that order in metadata. This metadata is
+    // recognised by the backend and lowered to a SGPR which can be read from
+    // using amdgcn_lds_kernel_id.
+
+    std::vector<Function *> OrderedKernels;
+    if (!KernelsThatAllocateTableLDS.empty() ||
+        !KernelsThatIndirectlyAllocateDynamicLDS.empty()) {
+
+      for (Function &Func : M->functions()) {
+        if (Func.isDeclaration())
+          continue;
+        if (!isKernelLDS(&Func))
+          continue;
+
+        if (KernelsThatAllocateTableLDS.contains(&Func) ||
+            KernelsThatIndirectlyAllocateDynamicLDS.contains(&Func)) {
+          assert(Func.hasName()); // else fatal error earlier
+          OrderedKernels.push_back(&Func);
+        }
+      }
+
+      // Put them in an arbitrary but reproducible order
+      OrderedKernels = sortByName(std::move(OrderedKernels));
+
+      // Annotate the kernels with their order in this vector
+      LLVMContext &Ctx = M->getContext();
+      IRBuilder<> Builder(Ctx);
+
+      if (OrderedKernels.size() > UINT32_MAX) {
+        // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
+        report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels");
+      }
+
+      for (size_t i = 0; i < OrderedKernels.size(); i++) {
+        Metadata *AttrMDArgs[1] = {
+            ConstantAsMetadata::get(Builder.getInt32(i)),
+        };
+        OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id",
+                                       MDNode::get(Ctx, AttrMDArgs));
+      }
+    }
+    return OrderedKernels;
+  }
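Editorial note: a sketch of the resulting annotation (kernel names hypothetical); ids are assigned in sorted-by-name order:

```llvm
define amdgpu_kernel void @kernel_a() !llvm.amdgcn.lds.kernel.id !0 {
  ret void
}

define amdgpu_kernel void @kernel_b() !llvm.amdgcn.lds.kernel.id !1 {
  ret void
}

!0 = !{i32 0}
!1 = !{i32 1}
```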
 
-    // Partition variables into the different strategies
-    DenseSet<GlobalVariable *> ModuleScopeVariables;
-    DenseSet<GlobalVariable *> TableLookupVariables;
-    DenseSet<GlobalVariable *> KernelAccessVariables;
-
-    {
-      GlobalVariable *HybridModuleRoot =
-          LoweringKindLoc != LoweringKind::hybrid
-              ? nullptr
-              : chooseBestVariableForModuleStrategy(
-                    M.getDataLayout(),
-                    LDSToKernelsThatNeedToAccessItIndirectly);
-
-      DenseSet<Function *> const EmptySet;
-      DenseSet<Function *> const &HybridModuleRootKernels =
-          HybridModuleRoot
-              ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot]
-              : EmptySet;
-
-      for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
-        // Each iteration of this loop assigns exactly one global variable to
-        // exactly one of the implementation strategies.
-
-        GlobalVariable *GV = K.first;
-        assert(AMDGPU::isLDSVariableToLower(*GV));
-        assert(K.second.size() != 0);
-
-        switch (LoweringKindLoc) {
-        case LoweringKind::module:
-          ModuleScopeVariables.insert(GV);
-          break;
-
-        case LoweringKind::table:
-          TableLookupVariables.insert(GV);
-          break;
-
-        case LoweringKind::kernel:
-          if (K.second.size() == 1) {
-            KernelAccessVariables.insert(GV);
-          } else {
-            report_fatal_error(
-                "cannot lower LDS '" + GV->getName() +
-                "' to kernel access as it is reachable from multiple kernels");
-          }
-          break;
-
-        case LoweringKind::hybrid: {
-          if (GV == HybridModuleRoot) {
-            assert(K.second.size() != 1);
-            ModuleScopeVariables.insert(GV);
-          } else if (K.second.size() == 1) {
-            KernelAccessVariables.insert(GV);
-          } else if (set_is_subset(K.second, HybridModuleRootKernels)) {
-            ModuleScopeVariables.insert(GV);
-          } else {
-            TableLookupVariables.insert(GV);
-          }
-          break;
-        }
-        }
-      }
-
-      assert(ModuleScopeVariables.size() + TableLookupVariables.size() +
-                 KernelAccessVariables.size() ==
-             LDSToKernelsThatNeedToAccessItIndirectly.size());
-    } // Variables have now been partitioned into the three lowering strategies.
+  static void partitionVariablesIntoIndirectStrategies(
+      Module &M, LDSUsesInfoTy const &LDSUsesInfo,
+      VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly,
+      DenseSet<GlobalVariable *> &ModuleScopeVariables,
+      DenseSet<GlobalVariable *> &TableLookupVariables,
+      DenseSet<GlobalVariable *> &KernelAccessVariables,
+      DenseSet<GlobalVariable *> &DynamicVariables) {
+
+    GlobalVariable *HybridModuleRoot =
+        LoweringKindLoc != LoweringKind::hybrid
+            ? nullptr
+            : chooseBestVariableForModuleStrategy(
+                  M.getDataLayout(), LDSToKernelsThatNeedToAccessItIndirectly);
+
+    DenseSet<Function *> const EmptySet;
+    DenseSet<Function *> const &HybridModuleRootKernels =
+        HybridModuleRoot
+            ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot]
+            : EmptySet;
+
+    for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
+      // Each iteration of this loop assigns exactly one global variable to
+      // exactly one of the implementation strategies.
+
+      GlobalVariable *GV = K.first;
+      assert(AMDGPU::isLDSVariableToLower(*GV));
+      assert(K.second.size() != 0);
+
+      if (AMDGPU::isDynamicLDS(*GV)) {
+        DynamicVariables.insert(GV);
+        continue;
+      }
+
+      switch (LoweringKindLoc) {
+      case LoweringKind::module:
+        ModuleScopeVariables.insert(GV);
+        break;
+
+      case LoweringKind::table:
+        TableLookupVariables.insert(GV);
+        break;
+
+      case LoweringKind::kernel:
+        if (K.second.size() == 1) {
+          KernelAccessVariables.insert(GV);
+        } else {
+          report_fatal_error(
+              "cannot lower LDS '" + GV->getName() +
+              "' to kernel access as it is reachable from multiple kernels");
+        }
+        break;
+
+      case LoweringKind::hybrid: {
+        if (GV == HybridModuleRoot) {
+          assert(K.second.size() != 1);
+          ModuleScopeVariables.insert(GV);
+        } else if (K.second.size() == 1) {
+          KernelAccessVariables.insert(GV);
+        } else if (set_is_subset(K.second, HybridModuleRootKernels)) {
+          ModuleScopeVariables.insert(GV);
+        } else {
+          TableLookupVariables.insert(GV);
        }
+        break;
+      }
+      }
+    }
+
+    // All LDS variables accessed indirectly have now been partitioned into
+    // the distinct lowering strategies.
+    assert(ModuleScopeVariables.size() + TableLookupVariables.size() +
+               KernelAccessVariables.size() + DynamicVariables.size() ==
+           LDSToKernelsThatNeedToAccessItIndirectly.size());
+  }
 
-    // If the kernel accesses a variable that is going to be stored in the
-    // module instance through a call then that kernel needs to allocate the
-    // module instance
-    DenseSet<Function *> KernelsThatAllocateModuleLDS =
-        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
-                                                        ModuleScopeVariables);
-    DenseSet<Function *> KernelsThatAllocateTableLDS =
-        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
-                                                        TableLookupVariables);
-
-    if (!ModuleScopeVariables.empty()) {
-      LDSVariableReplacement ModuleScopeReplacement =
-          createLDSVariableReplacement(M, "llvm.amdgcn.module.lds",
-                                       ModuleScopeVariables);
-
-      appendToCompilerUsed(M,
-                           {static_cast<GlobalValue *>(
-                               ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-                                   cast<Constant>(ModuleScopeReplacement.SGV),
-                                   Type::getInt8PtrTy(Ctx)))});
-
-      // historic
-      removeLocalVarsFromUsedLists(M, ModuleScopeVariables);
-
-      // Replace all uses of module scope variable from non-kernel functions
-      replaceLDSVariablesWithStruct(
-          M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
-            Instruction *I = dyn_cast<Instruction>(U.getUser());
-            if (!I) {
-              return false;
-            }
-            Function *F = I->getFunction();
-            return !isKernelLDS(F);
-          });
-
-      // Replace uses of module scope variable from kernel functions that
-      // allocate the module scope variable, otherwise leave them unchanged
-      // Record on each kernel whether the module scope global is used by it
-
-      LLVMContext &Ctx = M.getContext();
-      IRBuilder<> Builder(Ctx);
-
-      for (Function &Func : M.functions()) {
-        if (Func.isDeclaration() || !isKernelLDS(&Func))
-          continue;
-
-        if (KernelsThatAllocateModuleLDS.contains(&Func)) {
-          replaceLDSVariablesWithStruct(
-              M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
-                Instruction *I = dyn_cast<Instruction>(U.getUser());
-                if (!I) {
-                  return false;
-                }
-                Function *F = I->getFunction();
-                return F == &Func;
-              });
-
-          markUsedByKernel(Builder, &Func, ModuleScopeReplacement.SGV);
-
-        } else {
-          Func.addFnAttr("amdgpu-elide-module-lds");
-        }
-      }
-    }
+  static GlobalVariable *lowerModuleScopeStructVariables(
+      Module &M, DenseSet<GlobalVariable *> const &ModuleScopeVariables,
+      DenseSet<Function *> const &KernelsThatAllocateModuleLDS) {
+    // Create a struct to hold the ModuleScopeVariables.
+    // Replace all uses of those variables from non-kernel functions with the
+    // new struct instance. Replace only the uses from kernel functions that
+    // will allocate this instance. That is a space optimisation - kernels that
+    // use a subset of the module scope struct and do not need to allocate it
+    // for indirect calls will only allocate the subset they use (they do so as
+    // part of the per-kernel lowering).
+    if (ModuleScopeVariables.empty()) {
+      return nullptr;
+    }
+
+    LLVMContext &Ctx = M.getContext();
+
+    LDSVariableReplacement ModuleScopeReplacement =
+        createLDSVariableReplacement(M, "llvm.amdgcn.module.lds",
+                                     ModuleScopeVariables);
+
+    appendToCompilerUsed(M, {static_cast<GlobalValue *>(
+                                ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+                                    cast<Constant>(ModuleScopeReplacement.SGV),
+                                    Type::getInt8PtrTy(Ctx)))});
+
+    // module.lds will be allocated at zero in any kernel that allocates it
+    recordLDSAbsoluteAddress(&M, ModuleScopeReplacement.SGV, 0);
+
+    // historic
+    removeLocalVarsFromUsedLists(M, ModuleScopeVariables);
+
+    // Replace all uses of module scope variable from non-kernel functions
+    replaceLDSVariablesWithStruct(
+        M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
+          Instruction *I = dyn_cast<Instruction>(U.getUser());
+          if (!I) {
+            return false;
+          }
+          Function *F = I->getFunction();
+          return !isKernelLDS(F);
+        });
+
+    // Replace uses of module scope variable from kernel functions that
+    // allocate the module scope variable, otherwise leave them unchanged
+    // Record on each kernel whether the module scope global is used by it
+
+    for (Function &Func : M.functions()) {
+      if (Func.isDeclaration() || !isKernelLDS(&Func))
+        continue;
+
+      if (KernelsThatAllocateModuleLDS.contains(&Func)) {
+        replaceLDSVariablesWithStruct(
+            M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
+              Instruction *I = dyn_cast<Instruction>(U.getUser());
+              if (!I) {
+                return false;
+              }
+              Function *F = I->getFunction();
+              return F == &Func;
+            });
+
+        markUsedByKernel(&Func, ModuleScopeReplacement.SGV);
+      }
+    }
+
+    return ModuleScopeReplacement.SGV;
+  }
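Editorial note: a sketch of the module-scope replacement (field layout hypothetical): variables used by non-kernel functions collapse into one struct pinned at LDS address 0, and helper functions address its fields directly:

```llvm
%llvm.amdgcn.module.lds.t = type { float, i32 }

@llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 8, !absolute_symbol !0

define void @helper() {
  ; Former use of a standalone LDS variable, now field 0 of the module struct.
  store float 0.0, ptr addrspace(3) @llvm.amdgcn.module.lds
  ret void
}

!0 = !{i32 0, i32 1}
```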
 
-    // Create a struct for each kernel for the non-module-scope variables
+  static DenseMap<Function *, LDSVariableReplacement>
+  lowerKernelScopeStructVariables(
+      Module &M, LDSUsesInfoTy &LDSUsesInfo,
+      DenseSet<GlobalVariable *> const &ModuleScopeVariables,
+      DenseSet<Function *> const &KernelsThatAllocateModuleLDS,
+      GlobalVariable *MaybeModuleScopeStruct) {
+
+    // Create a struct for each kernel for the non-module-scope variables.
+
     DenseMap<Function *, LDSVariableReplacement> KernelToReplacement;
     for (Function &Func : M.functions()) {
       if (Func.isDeclaration() || !isKernelLDS(&Func))
         continue;
 
       DenseSet<GlobalVariable *> KernelUsedVariables;
+      // Allocating variables that are used directly in this struct to get
+      // alignment aware allocation and predictable frame size.
       for (auto &v : LDSUsesInfo.direct_access[&Func]) {
-        KernelUsedVariables.insert(v);
+        if (!AMDGPU::isDynamicLDS(*v)) {
+          KernelUsedVariables.insert(v);
+        }
       }
+
+      // Allocating variables that are accessed indirectly so that a lookup of
+      // this struct instance can find them from nested functions.
       for (auto &v : LDSUsesInfo.indirect_access[&Func]) {
-        KernelUsedVariables.insert(v);
+        if (!AMDGPU::isDynamicLDS(*v)) {
+          KernelUsedVariables.insert(v);
+        }
       }
 
       // Variables allocated in module lds must all resolve to that struct,
@@ -836,7 +939,8 @@
       }
 
       if (KernelUsedVariables.empty()) {
-        // Either used no LDS, or all the LDS it used was also in module
+        // Either used no LDS, or the LDS it used was all in the module struct
+        // or dynamically sized
         continue;
       }
 
@@ -856,6 +960,14 @@
       auto Replacement =
          createLDSVariableReplacement(M, VarName, KernelUsedVariables);
 
+      // If any indirect uses, create a direct use to ensure allocation
+      // TODO: Simpler to unconditionally mark used but that regresses
+      // codegen in test/CodeGen/AMDGPU/noclobber-barrier.ll
+      auto Accesses = LDSUsesInfo.indirect_access.find(&Func);
+      if ((Accesses != LDSUsesInfo.indirect_access.end()) &&
+          !Accesses->second.empty())
+        markUsedByKernel(&Func, Replacement.SGV);
+
       // remove preserves existing codegen
       removeLocalVarsFromUsedLists(M, KernelUsedVariables);
       KernelToReplacement[&Func] = Replacement;
@@ -867,6 +979,169 @@
             return I && I->getFunction() == &Func;
           });
     }
+    return KernelToReplacement;
+  }
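Editorial note: correspondingly, each kernel gets its own struct for the reachable variables that are not in the module struct; a sketch, with the naming pattern approximated from the code above (kernel name and layout hypothetical):

```llvm
%llvm.amdgcn.kernel.k0.lds.t = type { [32 x i32] }

; One instance per kernel; only @k0 allocates it, placed after module.lds
; (if any) plus alignment padding.
@llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t poison, align 16
```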
 
+  static GlobalVariable *
+  buildRepresentativeDynamicLDSInstance(Module &M, LDSUsesInfoTy &LDSUsesInfo,
+                                        Function *func) {
+    // Create a dynamic lds variable with a name associated with the passed
+    // function that has the maximum alignment of any dynamic lds variable
+    // reachable from this kernel. Dynamic LDS is allocated after the static
+    // LDS allocation, possibly after alignment padding. The representative
+    // variable created here has the maximum alignment of any other dynamic
+    // variable reachable by that kernel. All dynamic LDS variables are
+    // allocated at the same address in each kernel in order to provide the
+    // documented aliasing semantics. Setting the alignment here allows this
+    // IR pass to accurately predict the exact constant at which it will be
+    // allocated.
+
+    assert(isKernelLDS(func));
+
+    LLVMContext &Ctx = M.getContext();
+    const DataLayout &DL = M.getDataLayout();
+    Align MaxDynamicAlignment(1);
+
+    auto UpdateMaxAlignment = [&MaxDynamicAlignment, &DL](GlobalVariable *GV) {
+      if (AMDGPU::isDynamicLDS(*GV)) {
+        MaxDynamicAlignment =
+            std::max(MaxDynamicAlignment, AMDGPU::getAlign(DL, GV));
+      }
+    };
+
+    for (GlobalVariable *GV : LDSUsesInfo.indirect_access[func]) {
+      UpdateMaxAlignment(GV);
+    }
+
+    for (GlobalVariable *GV : LDSUsesInfo.direct_access[func]) {
+      UpdateMaxAlignment(GV);
+    }
+
+    assert(func->hasName()); // Checked by caller
+    auto emptyCharArray = ArrayType::get(Type::getInt8Ty(Ctx), 0);
+    GlobalVariable *N = new GlobalVariable(
+        M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr,
+        Twine("llvm.amdgcn." + func->getName() + ".dynlds"), nullptr,
+        GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
+    N->setAlignment(MaxDynamicAlignment);
+
+    assert(AMDGPU::isDynamicLDS(*N));
+    return N;
+  }
 
+  DenseMap<Function *, GlobalVariable *> lowerDynamicLDSVariables(
+      Module &M, LDSUsesInfoTy &LDSUsesInfo,
+      DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS,
+      DenseSet<GlobalVariable *> const &DynamicVariables,
+      std::vector<Function *> const &OrderedKernels) {
+    DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS;
+    if (!KernelsThatIndirectlyAllocateDynamicLDS.empty()) {
+      LLVMContext &Ctx = M.getContext();
+      IRBuilder<> Builder(Ctx);
+      Type *I32 = Type::getInt32Ty(Ctx);
+
+      std::vector<Constant *> newDynamicLDS;
+
+      // Table is built in the same order as OrderedKernels
+      for (auto &func : OrderedKernels) {
+
+        if (KernelsThatIndirectlyAllocateDynamicLDS.contains(func)) {
+          assert(isKernelLDS(func));
+          if (!func->hasName()) {
+            report_fatal_error("Anonymous kernels cannot use LDS variables");
+          }
+
+          GlobalVariable *N =
+              buildRepresentativeDynamicLDSInstance(M, LDSUsesInfo, func);
+
+          KernelToCreatedDynamicLDS[func] = N;
+
+          markUsedByKernel(func, N);
+
+          auto emptyCharArray = ArrayType::get(Type::getInt8Ty(Ctx), 0);
+          auto GEP = ConstantExpr::getGetElementPtr(
+              emptyCharArray, N, ConstantInt::get(I32, 0), true);
+          newDynamicLDS.push_back(ConstantExpr::getPtrToInt(GEP, I32));
+        } else {
+          newDynamicLDS.push_back(PoisonValue::get(I32));
+        }
+      }
+      assert(OrderedKernels.size() == newDynamicLDS.size());
+
+      ArrayType *t = ArrayType::get(I32, newDynamicLDS.size());
+      Constant *init = ConstantArray::get(t, newDynamicLDS);
+      GlobalVariable *table = new GlobalVariable(
+          M, t, true, GlobalValue::InternalLinkage, init,
+          "llvm.amdgcn.dynlds.offset.table", nullptr,
+          GlobalValue::NotThreadLocal, AMDGPUAS::CONSTANT_ADDRESS);
+
+      for (GlobalVariable *GV : DynamicVariables) {
+        for (Use &U : make_early_inc_range(GV->uses())) {
+          auto *I = dyn_cast<Instruction>(U.getUser());
+          if (!I)
+            continue;
+          if (isKernelLDS(I->getFunction()))
+            continue;
+
+          replaceUseWithTableLookup(M, Builder, table, GV, U, nullptr);
+        }
+      }
+    }
+    return KernelToCreatedDynamicLDS;
+  }
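Editorial note: a sketch of the dynamic-LDS artifacts this produces for one kernel (addresses hypothetical): one representative variable per kernel, plus the offset table consulted by non-kernel functions:

```llvm
; Per-kernel representative; carries the maximum alignment of any dynamic LDS
; variable reachable from @k0 and is pinned just after @k0's static frame.
@llvm.amdgcn.k0.dynlds = external addrspace(3) global [0 x i8], align 8, !absolute_symbol !0

; Indexed by llvm.amdgcn.lds.kernel.id; poison for kernels without dynamic LDS.
@llvm.amdgcn.dynlds.offset.table = internal addrspace(4) constant [1 x i32]
    [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.k0.dynlds to i32)]

!0 = !{i32 144, i32 145}
```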
 
+  bool runOnModule(Module &M) override {
+    CallGraph CG = CallGraph(M);
+    bool Changed = superAlignLDSGlobals(M);
+
+    Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
+
+    Changed = true; // todo: narrow this down
+
+    // For each kernel, what variables does it access directly or through
+    // callees
+    LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
+
+    // For each variable accessed through callees, which kernels access it
+    VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
+    for (auto &K : LDSUsesInfo.indirect_access) {
+      Function *F = K.first;
+      assert(isKernelLDS(F));
+      for (GlobalVariable *GV : K.second) {
+        LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
+      }
+    }
+
+    // Partition variables accessed indirectly into the different strategies
+    DenseSet<GlobalVariable *> ModuleScopeVariables;
+    DenseSet<GlobalVariable *> TableLookupVariables;
+    DenseSet<GlobalVariable *> KernelAccessVariables;
+    DenseSet<GlobalVariable *> DynamicVariables;
+    partitionVariablesIntoIndirectStrategies(
+        M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly,
+        ModuleScopeVariables, TableLookupVariables, KernelAccessVariables,
+        DynamicVariables);
+
+    // If the kernel accesses a variable that is going to be stored in the
+    // module instance through a call then that kernel needs to allocate the
+    // module instance
+    const DenseSet<Function *> KernelsThatAllocateModuleLDS =
+        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
+                                                        ModuleScopeVariables);
+    const DenseSet<Function *> KernelsThatAllocateTableLDS =
+        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
+                                                        TableLookupVariables);
+
+    const DenseSet<Function *> KernelsThatIndirectlyAllocateDynamicLDS =
+        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
+                                                        DynamicVariables);
+
+    GlobalVariable *MaybeModuleScopeStruct = lowerModuleScopeStructVariables(
+        M, ModuleScopeVariables, KernelsThatAllocateModuleLDS);
+
+    DenseMap<Function *, LDSVariableReplacement> KernelToReplacement =
+        lowerKernelScopeStructVariables(M, LDSUsesInfo, ModuleScopeVariables,
+                                        KernelsThatAllocateModuleLDS,
+                                        MaybeModuleScopeStruct);
 
     // Lower zero cost accesses to the kernel instances just created
     for (auto &GV : KernelAccessVariables) {
@@ -879,72 +1154,113 @@
       Vec.insert(GV);
 
       replaceLDSVariablesWithStruct(M, Vec, Replacement, [](Use &U) {
-                                      return isa<Instruction>(U.getUser());
+        return isa<Instruction>(U.getUser());
       });
     }
 
+    // The ith element of this vector is kernel id i
+    std::vector<Function *> OrderedKernels =
+        assignLDSKernelIDToEachKernel(&M, KernelsThatAllocateTableLDS,
+                                      KernelsThatIndirectlyAllocateDynamicLDS);
+
     if (!KernelsThatAllocateTableLDS.empty()) {
-      // Collect the kernels that allocate table lookup LDS
-      std::vector<Function *> OrderedKernels;
-      {
-        for (Function &Func : M.functions()) {
-          if (Func.isDeclaration())
-            continue;
-          if (!isKernelLDS(&Func))
-            continue;
-
-          if (KernelsThatAllocateTableLDS.contains(&Func)) {
-            assert(Func.hasName()); // else fatal error earlier
-            OrderedKernels.push_back(&Func);
-          }
-        }
-
-        // Put them in an arbitrary but reproducible order
-        llvm::sort(OrderedKernels.begin(), OrderedKernels.end(),
-                   [](const Function *lhs, const Function *rhs) -> bool {
-                     return lhs->getName() < rhs->getName();
-                   });
-
-        // Annotate the kernels with their order in this vector
-        LLVMContext &Ctx = M.getContext();
-        IRBuilder<> Builder(Ctx);
-
-        if (OrderedKernels.size() > UINT32_MAX) {
-          // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
-          report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels");
-        }
-
-        for (size_t i = 0; i < OrderedKernels.size(); i++) {
-          Metadata *AttrMDArgs[1] = {
-              ConstantAsMetadata::get(Builder.getInt32(i)),
-          };
-          OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id",
-                                         MDNode::get(Ctx, AttrMDArgs));
-
-          markUsedByKernel(Builder, OrderedKernels[i],
-                           KernelToReplacement[OrderedKernels[i]].SGV);
-        }
-      }
+      LLVMContext &Ctx = M.getContext();
+      IRBuilder<> Builder(Ctx);
 
       // The order must be consistent between lookup table and accesses to
       // lookup table
-      std::vector<GlobalVariable *> TableLookupVariablesOrdered(
-          TableLookupVariables.begin(), TableLookupVariables.end());
-      llvm::sort(TableLookupVariablesOrdered.begin(),
-                 TableLookupVariablesOrdered.end(),
-                 [](const GlobalVariable *lhs, const GlobalVariable *rhs) {
-                   return lhs->getName() < rhs->getName();
-                 });
+      auto TableLookupVariablesOrdered =
+          sortByName(std::vector<GlobalVariable *>(TableLookupVariables.begin(),
+                                                   TableLookupVariables.end()));
 
       GlobalVariable *LookupTable = buildLookupTable(
          M, TableLookupVariablesOrdered, OrderedKernels, KernelToReplacement);
       replaceUsesInInstructionsWithTableLookup(M, TableLookupVariablesOrdered,
                                                LookupTable);
     }
 
+    DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS =
+        lowerDynamicLDSVariables(M, LDSUsesInfo,
+                                 KernelsThatIndirectlyAllocateDynamicLDS,
+                                 DynamicVariables, OrderedKernels);
+
+    // All kernel frames have been allocated. Calculate and record the
+    // addresses.
+    {
+      const DataLayout &DL = M.getDataLayout();
+
+      for (Function &Func : M.functions()) {
+        if (Func.isDeclaration() || !isKernelLDS(&Func))
+          continue;
+
+        // All three of these are optional. The first variable is allocated at
+        // zero. They are allocated by AMDGPUMachineFunction as one block.
+        // Layout:
+        //{
+        //  module.lds
+        //  alignment padding
+        //  kernel instance
+        //  alignment padding
+        //  dynamic lds variables
+        //}
+        const bool AllocateModuleScopeStruct =
+            MaybeModuleScopeStruct &&
+            KernelsThatAllocateModuleLDS.contains(&Func);
+
+        auto Replacement = KernelToReplacement.find(&Func);
+        const bool AllocateKernelScopeStruct =
+            Replacement != KernelToReplacement.end();
+
+        const bool AllocateDynamicVariable =
+            KernelToCreatedDynamicLDS.contains(&Func);
+
+        uint32_t Offset = 0;
+
+        if (AllocateModuleScopeStruct) {
+          // Allocated at zero, recorded once on construction, not once per
+          // kernel
+          Offset += DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType());
+        }
+
+        if (AllocateKernelScopeStruct) {
+          GlobalVariable *KernelStruct = Replacement->second.SGV;
+          Offset = alignTo(Offset, AMDGPU::getAlign(DL, KernelStruct));
+          recordLDSAbsoluteAddress(&M, KernelStruct, Offset);
+          Offset += DL.getTypeAllocSize(KernelStruct->getValueType());
+        }
+
+        // If there is dynamic allocation, the alignment needed is included in
+        // the static frame size. There may be no reference to the dynamic
+        // variable in the kernel itself, so without including it here, that
+        // alignment padding could be missed.
+        if (AllocateDynamicVariable) {
+          GlobalVariable *DynamicVariable = KernelToCreatedDynamicLDS[&Func];
+          Offset = alignTo(Offset, AMDGPU::getAlign(DL, DynamicVariable));
+          recordLDSAbsoluteAddress(&M, DynamicVariable, Offset);
+        }
+
+        if (Offset != 0) {
+          std::string Buffer;
+          raw_string_ostream SS{Buffer};
+          SS << format("%u", Offset);
+
+          // Instead of explicitly marking kernels that access dynamic
+          // variables using special case metadata, annotate with
+          // min-lds == max-lds, i.e. that there is no more space available
+          // for allocating more static LDS variables. That is the right
+          // condition to prevent allocating more variables which would
+          // collide with the addresses assigned to dynamic variables.
+          if (AllocateDynamicVariable)
+            SS << format(",%u", Offset);
+
+          Func.addFnAttr("amdgpu-lds-size", Buffer);
+        }
+      }
+    }
 
     for (auto &GV : make_early_inc_range(M.globals()))
       if (AMDGPU::isLDSVariableToLower(GV)) {
-        // probably want to remove from used lists
         GV.removeDeadConstantUsers();
 
         if (GV.use_empty())
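Editorial note: the net effect on a kernel is a single attribute; a sketch (values hypothetical). The second number appears when dynamic LDS is used, recording min-lds == max-lds so no further static LDS can be allocated behind the dynamic region:

```llvm
define amdgpu_kernel void @k0() #0 {
  ret void
}

; 144 bytes of static LDS; "144,144" would mean dynamic LDS starts at 144
; and no more static LDS may be placed after it.
attributes #0 = { "amdgpu-lds-size"="144,144" }
```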
@@ -1017,12 +1333,9 @@ private:
     // The order of fields in this struct depends on the order of
     // variables in the argument which varies when changing how they
     // are identified, leading to spurious test breakage.
-    std::vector<GlobalVariable *> Sorted(LDSVarsToTransform.begin(),
-                                         LDSVarsToTransform.end());
-    llvm::sort(Sorted.begin(), Sorted.end(),
-               [](const GlobalVariable *lhs, const GlobalVariable *rhs) {
-                 return lhs->getName() < rhs->getName();
-               });
+    auto Sorted = sortByName(std::vector<GlobalVariable *>(
+        LDSVarsToTransform.begin(), LDSVarsToTransform.end()));
+
     for (GlobalVariable *GV : Sorted) {
       OptimizedStructLayoutField F(GV,
                                    DL.getTypeAllocSize(GV->getValueType()),
@@ -1101,21 +1414,17 @@
   }
 
   template <typename PredicateTy>
-  void replaceLDSVariablesWithStruct(
+  static void replaceLDSVariablesWithStruct(
       Module &M, DenseSet<GlobalVariable *> const &LDSVarsToTransformArg,
-      LDSVariableReplacement Replacement, PredicateTy Predicate) {
+      const LDSVariableReplacement &Replacement, PredicateTy Predicate) {
     LLVMContext &Ctx = M.getContext();
     const DataLayout &DL = M.getDataLayout();
 
     // A hack... we need to insert the aliasing info in a predictable order for
     // lit tests. Would like to have them in a stable order already, ideally the
     // same order they get allocated, which might mean an ordered set container
-    std::vector<GlobalVariable *> LDSVarsToTransform(
-        LDSVarsToTransformArg.begin(), LDSVarsToTransformArg.end());
-    llvm::sort(LDSVarsToTransform.begin(), LDSVarsToTransform.end(),
-               [](const GlobalVariable *lhs, const GlobalVariable *rhs) {
-                 return lhs->getName() < rhs->getName();
-               });
+    auto LDSVarsToTransform = sortByName(std::vector<GlobalVariable *>(
+        LDSVarsToTransformArg.begin(), LDSVarsToTransformArg.end()));
 
     // Create alias.scope and their lists. Each field in the new structure
     // does not alias with all other fields.
@@ -1137,7 +1446,7 @@
     // field of the instance that will be allocated by AMDGPUMachineFunction
     for (size_t I = 0; I < NumberVars; I++) {
       GlobalVariable *GV = LDSVarsToTransform[I];
-      Constant *GEP = Replacement.LDSVarsToConstantGEP[GV];
+      Constant *GEP = Replacement.LDSVarsToConstantGEP.at(GV);
 
       GV->replaceUsesWithIf(GEP, Predicate);
 
@@ -1159,9 +1468,9 @@
     }
   }
 
-  void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,
-                                MDNode *AliasScope, MDNode *NoAlias,
-                                unsigned MaxDepth = 5) {
+  static void refineUsesAlignmentAndAA(Value *Ptr, Align A,
+                                       const DataLayout &DL,
+                                       MDNode *AliasScope, MDNode *NoAlias,
+                                       unsigned MaxDepth = 5) {
     if (!MaxDepth || (A == 1 && !AliasScope))
       return;
