Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 255
1 file changed, 205 insertions, 50 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 14958a180ce3..727f71b35049 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -76,6 +76,11 @@ static cl::opt<bool> DisablePromoteAllocaToLDS(
   cl::desc("Disable promote alloca to LDS"),
   cl::init(false));
 
+static cl::opt<unsigned> PromoteAllocaToVectorLimit(
+  "amdgpu-promote-alloca-to-vector-limit",
+  cl::desc("Maximum byte size to consider promote alloca to vector"),
+  cl::init(0));
+
 // FIXME: This can create globals so should be a module pass.
 class AMDGPUPromoteAlloca : public FunctionPass {
 private:
@@ -86,6 +91,7 @@ private:
   // FIXME: This should be per-kernel.
   uint32_t LocalMemLimit = 0;
   uint32_t CurrentLocalMemUsage = 0;
+  unsigned MaxVGPRs;
 
   bool IsAMDGCN = false;
   bool IsAMDHSA = false;
@@ -128,14 +134,42 @@ public:
   }
 };
 
+class AMDGPUPromoteAllocaToVector : public FunctionPass {
+private:
+  unsigned MaxVGPRs;
+
+public:
+  static char ID;
+
+  AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  StringRef getPassName() const override {
+    return "AMDGPU Promote Alloca to vector";
+  }
+
+  bool handleAlloca(AllocaInst &I);
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+};
+
 } // end anonymous namespace
 
 char AMDGPUPromoteAlloca::ID = 0;
+char AMDGPUPromoteAllocaToVector::ID = 0;
 
 INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
                 "AMDGPU promote alloca to vector or LDS", false, false)
 
+INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
+                "AMDGPU promote alloca to vector", false, false)
+
 char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
+char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
 
 bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
   Mod = &M;
@@ -161,6 +195,13 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
   if (!ST.isPromoteAllocaEnabled())
     return false;
 
+  if (IsAMDGCN) {
+    const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+    MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+  } else {
+    MaxVGPRs = 128;
+  }
+
   bool SufficientLDS = hasSufficientLocalMem(F);
   bool Changed = false;
   BasicBlock &EntryBB = *F.begin();
@@ -251,10 +292,10 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
   // 32-bit and extract sequence is already present, and it is probably easier
   // to CSE this. The loads should be mergable later anyway.
   Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1);
-  LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, 4);
+  LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
 
   Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2);
-  LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, 4);
+  LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
 
   MDNode *MD = MDNode::get(Mod->getContext(), None);
   LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
@@ -297,15 +338,26 @@ Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
   return CI;
 }
 
-static VectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
-  return VectorType::get(ArrayTy->getElementType(),
-                         ArrayTy->getNumElements());
+static FixedVectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
+  return FixedVectorType::get(ArrayTy->getElementType(),
+                              ArrayTy->getNumElements());
+}
+
+static Value *stripBitcasts(Value *V) {
+  while (Instruction *I = dyn_cast<Instruction>(V)) {
+    if (I->getOpcode() != Instruction::BitCast)
+      break;
+    V = I->getOperand(0);
+  }
+  return V;
 }
 
 static Value *
 calculateVectorIndex(Value *Ptr,
                      const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
-  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(stripBitcasts(Ptr));
+  if (!GEP)
+    return nullptr;
 
   auto I = GEPIdx.find(GEP);
   return I == GEPIdx.end() ? nullptr : I->second;
@@ -327,7 +379,8 @@ static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
 //
 // TODO: Check isTriviallyVectorizable for calls and handle other
 // instructions.
-static bool canVectorizeInst(Instruction *Inst, User *User) {
+static bool canVectorizeInst(Instruction *Inst, User *User,
+                             const DataLayout &DL) {
   switch (Inst->getOpcode()) {
   case Instruction::Load: {
     // Currently only handle the case where the Pointer Operand is a GEP.
@@ -337,7 +390,14 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
         LI->getPointerOperandType() == User->getType() &&
         isa<VectorType>(LI->getType()))
       return true;
-    return isa<GetElementPtrInst>(LI->getPointerOperand()) && LI->isSimple();
+
+    Instruction *PtrInst = dyn_cast<Instruction>(LI->getPointerOperand());
+    if (!PtrInst)
+      return false;
+
+    return (PtrInst->getOpcode() == Instruction::GetElementPtr ||
+            PtrInst->getOpcode() == Instruction::BitCast) &&
+           LI->isSimple();
   }
   case Instruction::BitCast:
     return true;
@@ -350,22 +410,46 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
         SI->getPointerOperandType() == User->getType() &&
         isa<VectorType>(SI->getValueOperand()->getType()))
       return true;
-    return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && SI->isSimple();
+
+    Instruction *UserInst = dyn_cast<Instruction>(User);
+    if (!UserInst)
+      return false;
+
+    return (SI->getPointerOperand() == User) &&
+           (UserInst->getOpcode() == Instruction::GetElementPtr ||
+            UserInst->getOpcode() == Instruction::BitCast) &&
+           SI->isSimple();
   }
   default:
     return false;
   }
 }
 
-static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
+                                     unsigned MaxVGPRs) {
   if (DisablePromoteAllocaToVector) {
     LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n");
     return false;
   }
 
-  Type *AT = Alloca->getAllocatedType();
-  SequentialType *AllocaTy = dyn_cast<SequentialType>(AT);
+  Type *AllocaTy = Alloca->getAllocatedType();
+  auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
+  if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
+    if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
+        ArrayTy->getNumElements() > 0)
+      VectorTy = arrayTypeToVecType(ArrayTy);
+  }
+
+  // Use up to 1/4 of available register budget for vectorization.
+  unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
+                                              : (MaxVGPRs * 32);
+
+  if (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit) {
+    LLVM_DEBUG(dbgs() << " Alloca too big for vectorization with "
                      << MaxVGPRs << " registers available\n");
+    return false;
+  }
 
   LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
 
@@ -373,22 +457,44 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
   // are just being conservative for now.
   // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. Potentially these
   // could also be promoted but we don't currently handle this case
-  if (!AllocaTy ||
-      AllocaTy->getNumElements() > 16 ||
-      AllocaTy->getNumElements() < 2 ||
-      !VectorType::isValidElementType(AllocaTy->getElementType())) {
+  if (!VectorTy || VectorTy->getNumElements() > 16 ||
+      VectorTy->getNumElements() < 2) {
     LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
     return false;
   }
 
   std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
-  std::vector<Value*> WorkList;
-  for (User *AllocaUser : Alloca->users()) {
+  std::vector<Value *> WorkList;
+  SmallVector<User *, 8> Users(Alloca->users());
+  SmallVector<User *, 8> UseUsers(Users.size(), Alloca);
+  Type *VecEltTy = VectorTy->getElementType();
+  while (!Users.empty()) {
+    User *AllocaUser = Users.pop_back_val();
+    User *UseUser = UseUsers.pop_back_val();
+    Instruction *Inst = dyn_cast<Instruction>(AllocaUser);
+
     GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
     if (!GEP) {
-      if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
+      if (!canVectorizeInst(Inst, UseUser, DL))
         return false;
 
+      if (Inst->getOpcode() == Instruction::BitCast) {
+        Type *FromTy = Inst->getOperand(0)->getType()->getPointerElementType();
+        Type *ToTy = Inst->getType()->getPointerElementType();
+        if (FromTy->isAggregateType() || ToTy->isAggregateType() ||
+            DL.getTypeSizeInBits(FromTy) != DL.getTypeSizeInBits(ToTy))
+          continue;
+
+        for (User *CastUser : Inst->users()) {
+          if (isAssumeLikeIntrinsic(cast<Instruction>(CastUser)))
+            continue;
+          Users.push_back(CastUser);
+          UseUsers.push_back(Inst);
+        }
+
+        continue;
+      }
+
       WorkList.push_back(AllocaUser);
       continue;
     }
@@ -404,18 +510,10 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
     }
 
     GEPVectorIdx[GEP] = Index;
-    for (User *GEPUser : AllocaUser->users()) {
-      if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
-        return false;
-
-      WorkList.push_back(GEPUser);
-    }
+    Users.append(GEP->user_begin(), GEP->user_end());
+    UseUsers.append(GEP->getNumUses(), GEP);
   }
 
-  VectorType *VectorTy = dyn_cast<VectorType>(AllocaTy);
-  if (!VectorTy)
-    VectorTy = arrayTypeToVecType(cast<ArrayType>(AllocaTy));
-
   LLVM_DEBUG(dbgs() << " Converting alloca to vector "
                     << *AllocaTy << " -> " << *VectorTy << '\n');
 
@@ -424,40 +522,46 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
     IRBuilder<> Builder(Inst);
     switch (Inst->getOpcode()) {
     case Instruction::Load: {
-      if (Inst->getType() == AT)
+      if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy())
        break;
 
-      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
       Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+      if (!Index)
+        break;
+      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
       Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
       Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
+      if (Inst->getType() != VecEltTy)
+        ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
       Inst->replaceAllUsesWith(ExtractElement);
       Inst->eraseFromParent();
       break;
     }
     case Instruction::Store: {
       StoreInst *SI = cast<StoreInst>(Inst);
-      if (SI->getValueOperand()->getType() == AT)
+      if (SI->getValueOperand()->getType() == AllocaTy ||
+          SI->getValueOperand()->getType()->isVectorTy())
        break;
 
-      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
       Value *Ptr = SI->getPointerOperand();
       Value *Index = calculateVectorIndex(Ptr,
                                           GEPVectorIdx);
+      if (!Index)
+        break;
+
+      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
       Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
-      Value *NewVecValue = Builder.CreateInsertElement(VecValue,
-                                                       SI->getValueOperand(),
-                                                       Index);
+      Value *Elt = SI->getValueOperand();
+      if (Elt->getType() != VecEltTy)
+        Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
+      Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index);
       Builder.CreateStore(NewVecValue, BitCast);
       Inst->eraseFromParent();
       break;
     }
-    case Instruction::BitCast:
-    case Instruction::AddrSpaceCast:
-      break;
 
     default:
       llvm_unreachable("Inconsistency in instructions promotable to vector");
@@ -659,16 +763,15 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
         continue;
 
       if (Use->getParent()->getParent() == &F) {
-        unsigned Align = GV.getAlignment();
-        if (Align == 0)
-          Align = DL.getABITypeAlignment(GV.getValueType());
+        Align Alignment =
+            DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
 
         // FIXME: Try to account for padding here. The padding is currently
        // determined from the inverse order of uses in the function. I'm not
        // sure if the use list order is in any way connected to this, so the
        // total reported size is likely incorrect.
        uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
-        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
+        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alignment);
        CurrentLocalMemUsage += AllocSize;
        break;
       }
@@ -722,6 +825,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
   if (!I.isStaticAlloca() || I.isArrayAllocation())
     return false;
 
+  const DataLayout &DL = Mod->getDataLayout();
   IRBuilder<> Builder(&I);
 
   // First try to replace the alloca with a vector
@@ -729,7 +833,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
 
   LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
 
-  if (tryPromoteAllocaToVector(&I))
+  if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs))
     return true; // Promoted to vector.
 
   if (DisablePromoteAllocaToLDS)
@@ -759,11 +863,8 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction);
   unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
 
-  const DataLayout &DL = Mod->getDataLayout();
-
-  unsigned Align = I.getAlignment();
-  if (Align == 0)
-    Align = DL.getABITypeAlignment(I.getAllocatedType());
+  Align Alignment =
+      DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());
 
   // FIXME: This computed padding is likely wrong since it depends on inverse
   // usage order.
@@ -771,7 +872,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
 
   // FIXME: It is also possible that if we're allowed to use all of the memory
   // could could end up using more than the maximum due to alignment padding.
-  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
+  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
   uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
   NewSize += AllocSize;
 
@@ -938,6 +1039,60 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
   return true;
 }
 
+bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
+  if (skipFunction(F) || DisablePromoteAllocaToVector)
+    return false;
+
+  const TargetMachine *TM;
+  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
+    TM = &TPC->getTM<TargetMachine>();
+  else
+    return false;
+
+  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
+  if (!ST.isPromoteAllocaEnabled())
+    return false;
+
+  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
+    const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+    MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+  } else {
+    MaxVGPRs = 128;
+  }
+
+  bool Changed = false;
+  BasicBlock &EntryBB = *F.begin();
+
+  SmallVector<AllocaInst *, 16> Allocas;
+  for (Instruction &I : EntryBB) {
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
+      Allocas.push_back(AI);
+  }
+
+  for (AllocaInst *AI : Allocas) {
+    if (handleAlloca(*AI))
+      Changed = true;
+  }
+
+  return Changed;
+}
+
+bool AMDGPUPromoteAllocaToVector::handleAlloca(AllocaInst &I) {
+  // Array allocations are probably not worth handling, since an allocation of
+  // the array type is the canonical form.
+  if (!I.isStaticAlloca() || I.isArrayAllocation())
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
+
+  Module *Mod = I.getParent()->getParent()->getParent();
+  return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
+}
+
 FunctionPass *llvm::createAMDGPUPromoteAlloca() {
   return new AMDGPUPromoteAlloca();
 }
+
+FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
+  return new AMDGPUPromoteAllocaToVector();
+}
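Note on the new size gate in tryPromoteAllocaToVector(): the -amdgpu-promote-alloca-to-vector-limit value is given in bytes (hence the * 8 conversion to bits), the default budget is MaxVGPRs * 32 bits (each VGPR is 32 bits wide per lane), and multiplying the alloca size by 4 caps promotion at a quarter of that budget. The following is a minimal standalone sketch of that arithmetic, not part of the patch; fitsVectorBudget and its parameter names are illustrative only.

#include <cstdint>
#include <cstdio>

// Returns true if an alloca of the given bit size may be promoted under the
// "use up to 1/4 of the available register budget" rule from the patch.
// LimitBytesOpt mirrors -amdgpu-promote-alloca-to-vector-limit (0 = unset).
static bool fitsVectorBudget(uint64_t AllocaSizeBits, unsigned MaxVGPRs,
                             unsigned LimitBytesOpt) {
  // A VGPR holds 32 bits per lane, so the default budget is MaxVGPRs * 32
  // bits; an explicit byte limit overrides it (hence the * 8 to bits).
  uint64_t LimitBits = LimitBytesOpt ? uint64_t(LimitBytesOpt) * 8
                                     : uint64_t(MaxVGPRs) * 32;
  // Multiplying the alloca size by 4 enforces the quarter-of-budget cap,
  // matching the "DL.getTypeSizeInBits(AllocaTy) * 4 > Limit" check above.
  return AllocaSizeBits * 4 <= LimitBits;
}

int main() {
  // [16 x i32] is 512 bits; with 256 VGPRs the budget is 8192 bits and
  // 512 * 4 = 2048 <= 8192, so such an alloca passes the size check.
  std::printf("%d\n", fitsVectorBudget(512, 256, 0));  // prints 1
  // A 1024-byte (8192-bit) array exceeds a quarter of the same budget.
  std::printf("%d\n", fitsVectorBudget(8192, 256, 0)); // prints 0
  return 0;
}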