Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 212
1 file changed, 136 insertions(+), 76 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index b4a8766d682e..56a9a30bc59a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -29,6 +29,8 @@
#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDGPUMemoryUtils.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/IR/Constants.h"
@@ -43,6 +45,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/OptimizedStructLayout.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <tuple>
#include <vector>

#define DEBUG_TYPE "amdgpu-lower-module-lds"
@@ -97,6 +100,9 @@ class AMDGPULowerModuleLDS : public ModulePass {
static void
removeFromUsedLists(Module &M,
const std::vector<GlobalVariable *> &LocalVars) {
+ // The verifier rejects used lists containing an inttoptr of a constant,
+ // so remove the variables from these lists before replaceAllUsesWith.
+
SmallPtrSet<Constant *, 32> LocalVarsSet;
for (GlobalVariable *LocalVar : LocalVars)
if (Constant *C = dyn_cast<Constant>(LocalVar->stripPointerCasts()))
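For context: @llvm.used and @llvm.compiler.used are module-level arrays whose entries pin globals against removal, and after replaceAllUsesWith those entries would become inttoptr expressions the verifier rejects. A minimal sketch of the filtering this helper performs, covering only @llvm.compiler.used and assuming the usual [N x i8*] initializer shape; filterUsedList is an illustrative name, not a helper from this file:

static void filterUsedList(Module &M,
                           const SmallPtrSetImpl<Constant *> &Remove) {
  GlobalVariable *Used = M.getNamedGlobal("llvm.compiler.used");
  if (!Used || !Used->hasInitializer())
    return;
  // Keep every entry whose underlying global is not being lowered.
  SmallVector<GlobalValue *, 16> Kept;
  auto *Init = cast<ConstantArray>(Used->getInitializer());
  for (Value *Op : Init->operands()) {
    auto *Underlying = cast<Constant>(Op->stripPointerCasts());
    if (!Remove.count(Underlying))
      Kept.push_back(cast<GlobalValue>(Underlying));
  }
  // Rebuild the list from the surviving entries.
  Used->eraseFromParent();
  if (!Kept.empty())
    appendToCompilerUsed(M, Kept);
}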
@@ -146,12 +152,59 @@ public:
}
bool runOnModule(Module &M) override {
+ LLVMContext &Ctx = M.getContext();
CallGraph CG = CallGraph(M);
bool Changed = superAlignLDSGlobals(M);
+
+ // Move variables used by functions into amdgcn.module.lds
std::vector<GlobalVariable *> ModuleScopeVariables =
AMDGPU::findVariablesToLower(M, nullptr);
- Changed |= processUsedLDS(CG, M, ModuleScopeVariables);
+ if (!ModuleScopeVariables.empty()) {
+ std::string VarName = "llvm.amdgcn.module.lds";
+
+ GlobalVariable *SGV;
+ DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP;
+ std::tie(SGV, LDSVarsToConstantGEP) =
+ createLDSVariableReplacement(M, VarName, ModuleScopeVariables);
+
+ appendToCompilerUsed(
+ M, {static_cast<GlobalValue *>(
+ ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+ cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
+
+ removeFromUsedLists(M, ModuleScopeVariables);
+ replaceLDSVariablesWithStruct(M, ModuleScopeVariables, SGV,
+ LDSVarsToConstantGEP,
+ [](Use &) { return true; });
+
+ // This ensures the variable is allocated when called functions access it.
+ // It also lets other passes, specifically PromoteAlloca, accurately
+ // calculate how much LDS will be used by the kernel after lowering.
+ IRBuilder<> Builder(Ctx);
+ for (Function &Func : M.functions()) {
+ if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) {
+ const CallGraphNode *N = CG[&Func];
+ const bool CalleesRequireModuleLDS = N->size() > 0;
+
+ if (CalleesRequireModuleLDS) {
+ // If a function this kernel might call requires module LDS,
+ // annotate the kernel to let later passes know it will allocate
+ // this structure, even if not apparent from the IR.
+ markUsedByKernel(Builder, &Func, SGV);
+ } else {
+ // However if we are certain this kernel cannot call a function that
+ // requires module LDS, annotate the kernel so the backend can elide
+ // the allocation without repeating callgraph walks.
+ Func.addFnAttr("amdgpu-elide-module-lds");
+ }
+ }
+ }
+
+ Changed = true;
+ }
+
+ // Move variables used by kernels into per-kernel instances
for (Function &F : M.functions()) {
if (F.isDeclaration())
continue;
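The amdgpu-elide-module-lds attribute added above gives later phases an O(1) answer to a question that would otherwise need another callgraph walk. A sketch of the query a consumer might make; the helper name is illustrative and the backend's actual logic is not shown here:

static bool kernelMayNeedModuleLDS(const Function &F) {
  // Kernels marked above cannot reach any function that accesses
  // llvm.amdgcn.module.lds, so the allocation can be skipped.
  return !F.hasFnAttribute("amdgpu-elide-module-lds");
}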
@@ -159,9 +212,37 @@ public:
// Only lower compute kernels' LDS.
if (!AMDGPU::isKernel(F.getCallingConv()))
continue;
+
std::vector<GlobalVariable *> KernelUsedVariables =
AMDGPU::findVariablesToLower(M, &F);
- Changed |= processUsedLDS(CG, M, KernelUsedVariables, &F);
+
+ // Replace all constant uses with instructions if they belong to the
+ // current kernel. Not strictly necessary, but removing it causes test churn.
+ for (size_t I = 0; I < KernelUsedVariables.size(); I++) {
+ GlobalVariable *GV = KernelUsedVariables[I];
+ for (User *U : make_early_inc_range(GV->users())) {
+ if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
+ AMDGPU::replaceConstantUsesInFunction(C, &F);
+ }
+ GV->removeDeadConstantUsers();
+ }
+
+ if (!KernelUsedVariables.empty()) {
+ std::string VarName =
+ (Twine("llvm.amdgcn.kernel.") + F.getName() + ".lds").str();
+ GlobalVariable *SGV;
+ DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP;
+ std::tie(SGV, LDSVarsToConstantGEP) =
+ createLDSVariableReplacement(M, VarName, KernelUsedVariables);
+
+ removeFromUsedLists(M, KernelUsedVariables);
+ replaceLDSVariablesWithStruct(
+ M, KernelUsedVariables, SGV, LDSVarsToConstantGEP, [&F](Use &U) {
+ Instruction *I = dyn_cast<Instruction>(U.getUser());
+ return I && I->getFunction() == &F;
+ });
+ Changed = true;
+ }
}
return Changed;
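The constant-to-instruction rewrite in the loop above matters because replaceUsesWithIf is filtered on Instruction users: a use reached through a ConstantExpr has no owning function, so it must first be materialized as an instruction inside the kernel. A sketch of that technique for a single instruction user; AMDGPU::replaceConstantUsesInFunction does the real work, and materializeConstantUse here is illustrative:

static void materializeConstantUse(ConstantExpr *C, Instruction *UserInst) {
  // Clone the constant expression as an equivalent instruction, place it
  // directly before its user, and redirect that one use. Real code must
  // also recurse through nested ConstantExprs and handle PHI users by
  // inserting into the incoming block instead.
  Instruction *NewI = C->getAsInstruction();
  NewI->insertBefore(UserInst);
  UserInst->replaceUsesOfWith(C, NewI);
}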
@@ -212,16 +293,18 @@ private:
return Changed;
}
- bool processUsedLDS(CallGraph const &CG, Module &M,
- std::vector<GlobalVariable *> const &LDSVarsToTransform,
- Function *F = nullptr) {
+ std::tuple<GlobalVariable *, DenseMap<GlobalVariable *, Constant *>>
+ createLDSVariableReplacement(
+ Module &M, std::string VarName,
+ std::vector<GlobalVariable *> const &LDSVarsToTransform) {
+ // Create a struct instance containing LDSVarsToTransform and a map from
+ // those variables to constant GEPs into the struct.
+ // Padding variables may be introduced to meet alignment requirements; no
+ // aliasing metadata is useful for these as they have no uses, and they
+ // are erased before returning.
+
LLVMContext &Ctx = M.getContext();
const DataLayout &DL = M.getDataLayout();
-
- if (LDSVarsToTransform.empty()) {
- // No variables to rewrite, no changes made.
- return false;
- }
+ assert(!LDSVarsToTransform.empty());
SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
LayoutFields.reserve(LDSVarsToTransform.size());
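The elided context after this point builds one OptimizedStructLayoutField per variable, keyed by the variable itself so it can be recovered from Field.Id once the layout is computed. Roughly, assuming a getAlign-style helper for the variable's effective alignment (the exact helper is outside this hunk):

for (GlobalVariable *GV : LDSVarsToTransform) {
  uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
  Align Alignment = AMDGPU::getAlign(DL, GV); // assumed helper
  // Id, size and alignment; performOptimizedStructLayout reorders the
  // fields to minimize padding and returns the total size and alignment.
  LayoutFields.emplace_back(GV, Size, Alignment);
}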
@@ -234,9 +317,10 @@ private:
performOptimizedStructLayout(LayoutFields);
std::vector<GlobalVariable *> LocalVars;
+ BitVector IsPaddingField;
LocalVars.reserve(LDSVarsToTransform.size()); // will be at least this large
+ IsPaddingField.reserve(LDSVarsToTransform.size());
{
- // This usually won't need to insert any padding, perhaps avoid the alloc
uint64_t CurrentOffset = 0;
for (size_t I = 0; I < LayoutFields.size(); I++) {
GlobalVariable *FGV = static_cast<GlobalVariable *>(
@@ -256,10 +340,12 @@ private:
M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
"", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
false));
+ IsPaddingField.push_back(true);
CurrentOffset += Padding;
}
LocalVars.push_back(FGV);
+ IsPaddingField.push_back(false);
CurrentOffset += LayoutFields[I].Size;
}
}
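Padding appears whenever the chosen layout leaves a gap: for example, an i32 at offset 0 followed by an i64 placed at offset 8 requires a [4 x i8] member at offset 4. A sketch of the gap computation feeding the branch above (the exact elided context may differ):

// Fields arrive sorted by offset, so the gap is never negative.
assert(LayoutFields[I].Offset >= CurrentOffset);
// Bytes between the end of the previous field and the start of this one;
// a nonzero value triggers the [Padding x i8] member created above.
uint64_t Padding = LayoutFields[I].Offset - CurrentOffset;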
@@ -270,9 +356,6 @@ private:
LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
[](const GlobalVariable *V) -> Type * { return V->getValueType(); });
- std::string VarName(
- F ? (Twine("llvm.amdgcn.kernel.") + F->getName() + ".lds").str()
- : "llvm.amdgcn.module.lds");
StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");
Align StructAlign =
@@ -283,62 +366,65 @@ private:
VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
false);
SGV->setAlignment(StructAlign);
- if (!F) {
- appendToCompilerUsed(
- M, {static_cast<GlobalValue *>(
- ConstantExpr::getPointerBitCastOrAddrSpaceCast(
- cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
+
+ DenseMap<GlobalVariable *, Constant *> Map;
+ Type *I32 = Type::getInt32Ty(Ctx);
+ for (size_t I = 0; I < LocalVars.size(); I++) {
+ GlobalVariable *GV = LocalVars[I];
+ Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
+ Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx, true);
+ if (IsPaddingField[I]) {
+ assert(GV->use_empty());
+ GV->eraseFromParent();
+ } else {
+ Map[GV] = GEP;
+ }
}
+ assert(Map.size() == LDSVarsToTransform.size());
+ return {SGV, std::move(Map)};
+ }
- // The verifier rejects used lists containing an inttoptr of a constant
- // so remove the variables from these lists before replaceAllUsesWith
- removeFromUsedLists(M, LocalVars);
+ template <typename PredicateTy>
+ void replaceLDSVariablesWithStruct(
+ Module &M, std::vector<GlobalVariable *> const &LDSVarsToTransform,
+ GlobalVariable *SGV,
+ DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP,
+ PredicateTy Predicate) {
+ LLVMContext &Ctx = M.getContext();
+ const DataLayout &DL = M.getDataLayout();
// Create alias.scope and their lists. Each field in the new structure
// does not alias any other field.
SmallVector<MDNode *> AliasScopes;
SmallVector<Metadata *> NoAliasList;
- if (LocalVars.size() > 1) {
+ const size_t NumberVars = LDSVarsToTransform.size();
+ if (NumberVars > 1) {
MDBuilder MDB(Ctx);
- AliasScopes.reserve(LocalVars.size());
+ AliasScopes.reserve(NumberVars);
MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
- for (size_t I = 0; I < LocalVars.size(); I++) {
+ for (size_t I = 0; I < NumberVars; I++) {
MDNode *Scope = MDB.createAnonymousAliasScope(Domain);
AliasScopes.push_back(Scope);
}
NoAliasList.append(&AliasScopes[1], AliasScopes.end());
}
- // Replace uses of ith variable with a constantexpr to the ith field of the
- // instance that will be allocated by AMDGPUMachineFunction
- Type *I32 = Type::getInt32Ty(Ctx);
- for (size_t I = 0; I < LocalVars.size(); I++) {
- GlobalVariable *GV = LocalVars[I];
- Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
- Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx);
- if (F) {
- // Replace all constant uses with instructions if they belong to the
- // current kernel.
- for (User *U : make_early_inc_range(GV->users())) {
- if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
- AMDGPU::replaceConstantUsesInFunction(C, F);
- }
-
- GV->removeDeadConstantUsers();
+ // Replace uses of the i-th variable with a ConstantExpr GEP to the
+ // corresponding field of the instance that will be allocated by
+ // AMDGPUMachineFunction
+ for (size_t I = 0; I < NumberVars; I++) {
+ GlobalVariable *GV = LDSVarsToTransform[I];
+ Constant *GEP = LDSVarsToConstantGEP[GV];
- GV->replaceUsesWithIf(GEP, [F](Use &U) {
- Instruction *I = dyn_cast<Instruction>(U.getUser());
- return I && I->getFunction() == F;
- });
- } else {
- GV->replaceAllUsesWith(GEP);
- }
+ GV->replaceUsesWithIf(GEP, Predicate);
if (GV->use_empty()) {
GV->eraseFromParent();
}
- uint64_t Off = DL.getStructLayout(LDSTy)->getElementOffset(I);
- Align A = commonAlignment(StructAlign, Off);
+ APInt APOff(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
+ GEP->stripAndAccumulateInBoundsConstantOffsets(DL, APOff);
+ uint64_t Offset = APOff.getZExtValue();
+
+ Align A = commonAlignment(SGV->getAlign().valueOrOne(), Offset);
if (I)
NoAliasList[I - 1] = AliasScopes[I - 1];
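This helper no longer has the struct layout in scope, so the field's byte offset is recovered by folding the constant GEP, and commonAlignment combines it with the struct's alignment: a struct aligned to 16 with a field at byte offset 4 yields Align(4). A sketch of how refineUsesAlignmentAndAA (body elided below) might apply the results to one load; the real function also walks GEP, bitcast and addrspacecast users, and annotateAccess is an illustrative name:

void annotateAccess(LoadInst *LI, Align A, MDNode *AliasScope,
                    MDNode *NoAlias) {
  // Raise the access alignment if the struct layout proves a larger one.
  LI->setAlignment(std::max(LI->getAlign(), A));
  // Accumulate the scoped-AA facts: this access is in its own scope and
  // does not alias accesses to the other fields.
  LI->setMetadata(LLVMContext::MD_alias_scope,
                  MDNode::concatenate(
                      LI->getMetadata(LLVMContext::MD_alias_scope), AliasScope));
  LI->setMetadata(LLVMContext::MD_noalias,
                  MDNode::concatenate(
                      LI->getMetadata(LLVMContext::MD_noalias), NoAlias));
}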
@@ -349,32 +435,6 @@ private:
refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
}
-
- // This ensures the variable is allocated when called functions access it.
- // It also lets other passes, specifically PromoteAlloca, accurately
- // calculate how much LDS will be used by the kernel after lowering.
- if (!F) {
- IRBuilder<> Builder(Ctx);
- for (Function &Func : M.functions()) {
- if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) {
- const CallGraphNode *N = CG[&Func];
- const bool CalleesRequireModuleLDS = N->size() > 0;
-
- if (CalleesRequireModuleLDS) {
- // If a function this kernel might call requires module LDS,
- // annotate the kernel to let later passes know it will allocate
- // this structure, even if not apparent from the IR.
- markUsedByKernel(Builder, &Func, SGV);
- } else {
- // However if we are certain this kernel cannot call a function that
- // requires module LDS, annotate the kernel so the backend can elide
- // the allocation without repeating callgraph walks.
- Func.addFnAttr("amdgpu-elide-module-lds");
- }
- }
- }
- }
- return true;
}
void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,