1 files changed, 136 insertions, 76 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index b4a8766d682e..56a9a30bc59a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -29,6 +29,8 @@
 #include "AMDGPU.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "Utils/AMDGPUMemoryUtils.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/IR/Constants.h"
@@ -43,6 +45,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/OptimizedStructLayout.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <tuple>
 #include <vector>
 
 #define DEBUG_TYPE "amdgpu-lower-module-lds"
@@ -97,6 +100,9 @@ class AMDGPULowerModuleLDS : public ModulePass {
   static void
   removeFromUsedLists(Module &M,
                       const std::vector<GlobalVariable *> &LocalVars) {
+    // The verifier rejects used lists containing an inttoptr of a constant,
+    // so remove the variables from these lists before replacing their uses.
+
     SmallPtrSet<Constant *, 32> LocalVarsSet;
     for (GlobalVariable *LocalVar : LocalVars)
       if (Constant *C = dyn_cast<Constant>(LocalVar->stripPointerCasts()))
@@ -146,12 +152,59 @@ public:
   }
 
   bool runOnModule(Module &M) override {
+    LLVMContext &Ctx = M.getContext();
     CallGraph CG = CallGraph(M);
     bool Changed = superAlignLDSGlobals(M);
+
+    // Move variables used by functions into amdgcn.module.lds
     std::vector<GlobalVariable *> ModuleScopeVariables =
         AMDGPU::findVariablesToLower(M, nullptr);
-    Changed |= processUsedLDS(CG, M, ModuleScopeVariables);
+    if (!ModuleScopeVariables.empty()) {
+      std::string VarName = "llvm.amdgcn.module.lds";
+
+      GlobalVariable *SGV;
+      DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP;
+      std::tie(SGV, LDSVarsToConstantGEP) =
+          createLDSVariableReplacement(M, VarName, ModuleScopeVariables);
+
+      appendToCompilerUsed(
+          M, {static_cast<GlobalValue *>(
+                 ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+                     cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
+
+      removeFromUsedLists(M, ModuleScopeVariables);
+      replaceLDSVariablesWithStruct(M, ModuleScopeVariables, SGV,
+                                    LDSVarsToConstantGEP,
+                                    [](Use &) { return true; });
+
+      // This ensures the variable is allocated when called functions access it.
+      // It also lets other passes, specifically PromoteAlloca, accurately
+      // calculate how much LDS will be used by the kernel after lowering.
 
+      IRBuilder<> Builder(Ctx);
+      for (Function &Func : M.functions()) {
+        if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) {
+          const CallGraphNode *N = CG[&Func];
+          const bool CalleesRequireModuleLDS = N->size() > 0;
+
+          if (CalleesRequireModuleLDS) {
+            // If a function this kernel might call requires module LDS,
+            // annotate the kernel to let later passes know it will allocate
+            // this structure, even if not apparent from the IR.
+            markUsedByKernel(Builder, &Func, SGV);
+          } else {
+            // However if we are certain this kernel cannot call a function that
+            // requires module LDS, annotate the kernel so the backend can elide
+            // the allocation without repeating callgraph walks.
+            Func.addFnAttr("amdgpu-elide-module-lds");
+          }
+        }
+      }
+
+      Changed = true;
+    }
+
+    // Move variables used by kernels into per-kernel instances
     for (Function &F : M.functions()) {
       if (F.isDeclaration())
         continue;
@@ -159,9 +212,37 @@ public:
       // Only lower compute kernels' LDS.
       if (!AMDGPU::isKernel(F.getCallingConv()))
         continue;
+
       std::vector<GlobalVariable *> KernelUsedVariables =
           AMDGPU::findVariablesToLower(M, &F);
-      Changed |= processUsedLDS(CG, M, KernelUsedVariables, &F);
+
+      // Replace all constant uses with instructions if they belong to the
+      // current kernel. Not required, but removing it would cause test churn.
+      for (size_t I = 0; I < KernelUsedVariables.size(); I++) {
+        GlobalVariable *GV = KernelUsedVariables[I];
+        for (User *U : make_early_inc_range(GV->users())) {
+          if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
+            AMDGPU::replaceConstantUsesInFunction(C, &F);
+        }
+        GV->removeDeadConstantUsers();
+      }
+
+      if (!KernelUsedVariables.empty()) {
+        std::string VarName =
+            (Twine("llvm.amdgcn.kernel.") + F.getName() + ".lds").str();
+        GlobalVariable *SGV;
+        DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP;
+        std::tie(SGV, LDSVarsToConstantGEP) =
+            createLDSVariableReplacement(M, VarName, KernelUsedVariables);
+
+        removeFromUsedLists(M, KernelUsedVariables);
+        replaceLDSVariablesWithStruct(
+            M, KernelUsedVariables, SGV, LDSVarsToConstantGEP, [&F](Use &U) {
+              Instruction *I = dyn_cast<Instruction>(U.getUser());
+              return I && I->getFunction() == &F;
+            });
+        Changed = true;
+      }
     }
 
     return Changed;
@@ -212,16 +293,18 @@ private:
     return Changed;
   }
 
-  bool processUsedLDS(CallGraph const &CG, Module &M,
-                      std::vector<GlobalVariable *> const &LDSVarsToTransform,
-                      Function *F = nullptr) {
+  std::tuple<GlobalVariable *, DenseMap<GlobalVariable *, Constant *>>
+  createLDSVariableReplacement(
+      Module &M, std::string VarName,
+      std::vector<GlobalVariable *> const &LDSVarsToTransform) {
+    // Create a struct instance containing LDSVarsToTransform and a map from
+    // those variables to ConstantExpr GEPs into it. Padding variables may be
+    // introduced to meet alignment requirements; they have no uses, so no
+    // aliasing metadata is needed for them, and they are erased before return.
+
     LLVMContext &Ctx = M.getContext();
     const DataLayout &DL = M.getDataLayout();
-
-    if (LDSVarsToTransform.empty()) {
-      // No variables to rewrite, no changes made.
-      return false;
-    }
+    assert(!LDSVarsToTransform.empty());
 
     SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
     LayoutFields.reserve(LDSVarsToTransform.size());
@@ -234,9 +317,10 @@ private:
     performOptimizedStructLayout(LayoutFields);
 
     std::vector<GlobalVariable *> LocalVars;
+    BitVector IsPaddingField;
     LocalVars.reserve(LDSVarsToTransform.size()); // will be at least this large
+    IsPaddingField.reserve(LDSVarsToTransform.size());
     {
-      // This usually won't need to insert any padding, perhaps avoid the alloc
       uint64_t CurrentOffset = 0;
       for (size_t I = 0; I < LayoutFields.size(); I++) {
         GlobalVariable *FGV = static_cast<GlobalVariable *>(
@@ -256,10 +340,12 @@ private:
               M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
               "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
               false));
+          IsPaddingField.push_back(true);
           CurrentOffset += Padding;
         }
 
         LocalVars.push_back(FGV);
+        IsPaddingField.push_back(false);
         CurrentOffset += LayoutFields[I].Size;
       }
     }
@@ -270,9 +356,6 @@ private:
         LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
         [](const GlobalVariable *V) -> Type * { return V->getValueType(); });
 
-    std::string VarName(
-        F ? (Twine("llvm.amdgcn.kernel.") + F->getName() + ".lds").str()
-          : "llvm.amdgcn.module.lds");
     StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");
 
     Align StructAlign =
@@ -283,62 +366,65 @@ private:
         VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
         false);
     SGV->setAlignment(StructAlign);
-    if (!F) {
-      appendToCompilerUsed(
-          M, {static_cast<GlobalValue *>(
-                 ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-                     cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
+
+    DenseMap<GlobalVariable *, Constant *> Map;
+    Type *I32 = Type::getInt32Ty(Ctx);
+    for (size_t I = 0; I < LocalVars.size(); I++) {
+      GlobalVariable *GV = LocalVars[I];
+      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
+      Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx, true);
+      if (IsPaddingField[I]) {
+        assert(GV->use_empty());
+        GV->eraseFromParent();
+      } else {
+        Map[GV] = GEP;
+      }
     }
+    assert(Map.size() == LDSVarsToTransform.size());
+    return {SGV, std::move(Map)};
+  }
 
-    // The verifier rejects used lists containing an inttoptr of a constant
-    // so remove the variables from these lists before replaceAllUsesWith
-    removeFromUsedLists(M, LocalVars);
+  template <typename PredicateTy>
+  void replaceLDSVariablesWithStruct(
+      Module &M, std::vector<GlobalVariable *> const &LDSVarsToTransform,
+      GlobalVariable *SGV,
+      DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP,
+      PredicateTy Predicate) {
+    LLVMContext &Ctx = M.getContext();
+    const DataLayout &DL = M.getDataLayout();
 
     // Create alias.scope and their lists. Each field in the new structure
     // does not alias with all other fields.
     SmallVector<MDNode *> AliasScopes;
     SmallVector<Metadata *> NoAliasList;
-    if (LocalVars.size() > 1) {
+    const size_t NumberVars = LDSVarsToTransform.size();
+    if (NumberVars > 1) {
       MDBuilder MDB(Ctx);
-      AliasScopes.reserve(LocalVars.size());
+      AliasScopes.reserve(NumberVars);
       MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
-      for (size_t I = 0; I < LocalVars.size(); I++) {
+      for (size_t I = 0; I < NumberVars; I++) {
         MDNode *Scope = MDB.createAnonymousAliasScope(Domain);
         AliasScopes.push_back(Scope);
       }
       NoAliasList.append(&AliasScopes[1], AliasScopes.end());
     }
 
-    // Replace uses of ith variable with a constantexpr to the ith field of the
-    // instance that will be allocated by AMDGPUMachineFunction
-    Type *I32 = Type::getInt32Ty(Ctx);
-    for (size_t I = 0; I < LocalVars.size(); I++) {
-      GlobalVariable *GV = LocalVars[I];
-      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
-      Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx);
-      if (F) {
-        // Replace all constant uses with instructions if they belong to the
-        // current kernel.
-        for (User *U : make_early_inc_range(GV->users())) {
-          if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
-            AMDGPU::replaceConstantUsesInFunction(C, F);
-        }
-
-        GV->removeDeadConstantUsers();
+    // Replace uses of ith variable with a constantexpr to the corresponding
+    // field of the instance that will be allocated by AMDGPUMachineFunction
+    for (size_t I = 0; I < NumberVars; I++) {
+      GlobalVariable *GV = LDSVarsToTransform[I];
+      Constant *GEP = LDSVarsToConstantGEP[GV];
 
-        GV->replaceUsesWithIf(GEP, [F](Use &U) {
-          Instruction *I = dyn_cast<Instruction>(U.getUser());
-          return I && I->getFunction() == F;
-        });
-      } else {
-        GV->replaceAllUsesWith(GEP);
-      }
+      GV->replaceUsesWithIf(GEP, Predicate);
       if (GV->use_empty()) {
         GV->eraseFromParent();
       }
 
-      uint64_t Off = DL.getStructLayout(LDSTy)->getElementOffset(I);
-      Align A = commonAlignment(StructAlign, Off);
+      APInt APOff(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
+      GEP->stripAndAccumulateInBoundsConstantOffsets(DL, APOff);
+      uint64_t Offset = APOff.getZExtValue();
+
+      Align A = commonAlignment(SGV->getAlign().valueOrOne(), Offset);
 
       if (I)
         NoAliasList[I - 1] = AliasScopes[I - 1];
@@ -349,32 +435,6 @@ private:
 
       refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
     }
-
-    // This ensures the variable is allocated when called functions access it.
-    // It also lets other passes, specifically PromoteAlloca, accurately
-    // calculate how much LDS will be used by the kernel after lowering.
-    if (!F) {
-      IRBuilder<> Builder(Ctx);
-      for (Function &Func : M.functions()) {
-        if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) {
-          const CallGraphNode *N = CG[&Func];
-          const bool CalleesRequireModuleLDS = N->size() > 0;
-
-          if (CalleesRequireModuleLDS) {
-            // If a function this kernel might call requires module LDS,
-            // annotate the kernel to let later passes know it will allocate
-            // this structure, even if not apparent from the IR.
-            markUsedByKernel(Builder, &Func, SGV);
-          } else {
-            // However if we are certain this kernel cannot call a function that
-            // requires module LDS, annotate the kernel so the backend can elide
-            // the allocation without repeating callgraph walks.
-            Func.addFnAttr("amdgpu-elide-module-lds");
-          }
-        }
-      }
-    }
-    return true;
   }
 
   void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,