Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp')
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 255
1 file changed, 205 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 14958a180ce3..727f71b35049 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -76,6 +76,11 @@ static cl::opt<bool> DisablePromoteAllocaToLDS(
cl::desc("Disable promote alloca to LDS"),
cl::init(false));
+static cl::opt<unsigned> PromoteAllocaToVectorLimit(
+ "amdgpu-promote-alloca-to-vector-limit",
+ cl::desc("Maximum byte size to consider promote alloca to vector"),
+ cl::init(0));
+
// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {
private:
@@ -86,6 +91,7 @@ private:
// FIXME: This should be per-kernel.
uint32_t LocalMemLimit = 0;
uint32_t CurrentLocalMemUsage = 0;
+ unsigned MaxVGPRs;
bool IsAMDGCN = false;
bool IsAMDHSA = false;
@@ -128,14 +134,42 @@ public:
}
};
+class AMDGPUPromoteAllocaToVector : public FunctionPass {
+private:
+ unsigned MaxVGPRs;
+
+public:
+ static char ID;
+
+ AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override {
+ return "AMDGPU Promote Alloca to vector";
+ }
+
+ bool handleAlloca(AllocaInst &I);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+};
+
} // end anonymous namespace
char AMDGPUPromoteAlloca::ID = 0;
+char AMDGPUPromoteAllocaToVector::ID = 0;
INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
"AMDGPU promote alloca to vector or LDS", false, false)
+INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
+ "AMDGPU promote alloca to vector", false, false)
+
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
+char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
Mod = &M;
@@ -161,6 +195,13 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
if (!ST.isPromoteAllocaEnabled())
return false;
+ if (IsAMDGCN) {
+ const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+ MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+ } else {
+ MaxVGPRs = 128;
+ }
+
bool SufficientLDS = hasSufficientLocalMem(F);
bool Changed = false;
BasicBlock &EntryBB = *F.begin();
@@ -251,10 +292,10 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
// 32-bit and extract sequence is already present, and it is probably easier
// to CSE this. The loads should be mergable later anyway.
Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1);
- LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, 4);
+ LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2);
- LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, 4);
+ LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
MDNode *MD = MDNode::get(Mod->getContext(), None);
LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
@@ -297,15 +338,26 @@ Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
return CI;
}
-static VectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
- return VectorType::get(ArrayTy->getElementType(),
- ArrayTy->getNumElements());
+static FixedVectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
+ return FixedVectorType::get(ArrayTy->getElementType(),
+ ArrayTy->getNumElements());
+}
+
+static Value *stripBitcasts(Value *V) {
+ while (Instruction *I = dyn_cast<Instruction>(V)) {
+ if (I->getOpcode() != Instruction::BitCast)
+ break;
+ V = I->getOperand(0);
+ }
+ return V;
}
static Value *
calculateVectorIndex(Value *Ptr,
const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
- GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(stripBitcasts(Ptr));
+ if (!GEP)
+ return nullptr;
auto I = GEPIdx.find(GEP);
return I == GEPIdx.end() ? nullptr : I->second;
@@ -327,7 +379,8 @@ static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
//
// TODO: Check isTriviallyVectorizable for calls and handle other
// instructions.
-static bool canVectorizeInst(Instruction *Inst, User *User) {
+static bool canVectorizeInst(Instruction *Inst, User *User,
+ const DataLayout &DL) {
switch (Inst->getOpcode()) {
case Instruction::Load: {
// Currently only handle the case where the Pointer Operand is a GEP.
@@ -337,7 +390,14 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
LI->getPointerOperandType() == User->getType() &&
isa<VectorType>(LI->getType()))
return true;
- return isa<GetElementPtrInst>(LI->getPointerOperand()) && LI->isSimple();
+
+ Instruction *PtrInst = dyn_cast<Instruction>(LI->getPointerOperand());
+ if (!PtrInst)
+ return false;
+
+ return (PtrInst->getOpcode() == Instruction::GetElementPtr ||
+ PtrInst->getOpcode() == Instruction::BitCast) &&
+ LI->isSimple();
}
case Instruction::BitCast:
return true;
@@ -350,22 +410,46 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
SI->getPointerOperandType() == User->getType() &&
isa<VectorType>(SI->getValueOperand()->getType()))
return true;
- return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && SI->isSimple();
+
+ Instruction *UserInst = dyn_cast<Instruction>(User);
+ if (!UserInst)
+ return false;
+
+ return (SI->getPointerOperand() == User) &&
+ (UserInst->getOpcode() == Instruction::GetElementPtr ||
+ UserInst->getOpcode() == Instruction::BitCast) &&
+ SI->isSimple();
}
default:
return false;
}
}
-static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
+ unsigned MaxVGPRs) {
if (DisablePromoteAllocaToVector) {
LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n");
return false;
}
- Type *AT = Alloca->getAllocatedType();
- SequentialType *AllocaTy = dyn_cast<SequentialType>(AT);
+ Type *AllocaTy = Alloca->getAllocatedType();
+ auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
+ if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
+ if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
+ ArrayTy->getNumElements() > 0)
+ VectorTy = arrayTypeToVecType(ArrayTy);
+ }
+
+ // Use up to 1/4 of available register budget for vectorization.
+ unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
+ : (MaxVGPRs * 32);
+
+ if (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit) {
+ LLVM_DEBUG(dbgs() << " Alloca too big for vectorization with "
+ << MaxVGPRs << " registers available\n");
+ return false;
+ }
LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
@@ -373,22 +457,44 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
// are just being conservative for now.
// FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. Potentially these
// could also be promoted but we don't currently handle this case
- if (!AllocaTy ||
- AllocaTy->getNumElements() > 16 ||
- AllocaTy->getNumElements() < 2 ||
- !VectorType::isValidElementType(AllocaTy->getElementType())) {
+ if (!VectorTy || VectorTy->getNumElements() > 16 ||
+ VectorTy->getNumElements() < 2) {
LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
return false;
}
std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
- std::vector<Value*> WorkList;
- for (User *AllocaUser : Alloca->users()) {
+ std::vector<Value *> WorkList;
+ SmallVector<User *, 8> Users(Alloca->users());
+ SmallVector<User *, 8> UseUsers(Users.size(), Alloca);
+ Type *VecEltTy = VectorTy->getElementType();
+ while (!Users.empty()) {
+ User *AllocaUser = Users.pop_back_val();
+ User *UseUser = UseUsers.pop_back_val();
+ Instruction *Inst = dyn_cast<Instruction>(AllocaUser);
+
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
if (!GEP) {
- if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
+ if (!canVectorizeInst(Inst, UseUser, DL))
return false;
+ if (Inst->getOpcode() == Instruction::BitCast) {
+ Type *FromTy = Inst->getOperand(0)->getType()->getPointerElementType();
+ Type *ToTy = Inst->getType()->getPointerElementType();
+ if (FromTy->isAggregateType() || ToTy->isAggregateType() ||
+ DL.getTypeSizeInBits(FromTy) != DL.getTypeSizeInBits(ToTy))
+ continue;
+
+ for (User *CastUser : Inst->users()) {
+ if (isAssumeLikeIntrinsic(cast<Instruction>(CastUser)))
+ continue;
+ Users.push_back(CastUser);
+ UseUsers.push_back(Inst);
+ }
+
+ continue;
+ }
+
WorkList.push_back(AllocaUser);
continue;
}
@@ -404,18 +510,10 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
}
GEPVectorIdx[GEP] = Index;
- for (User *GEPUser : AllocaUser->users()) {
- if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
- return false;
-
- WorkList.push_back(GEPUser);
- }
+ Users.append(GEP->user_begin(), GEP->user_end());
+ UseUsers.append(GEP->getNumUses(), GEP);
}
- VectorType *VectorTy = dyn_cast<VectorType>(AllocaTy);
- if (!VectorTy)
- VectorTy = arrayTypeToVecType(cast<ArrayType>(AllocaTy));
-
LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
<< *VectorTy << '\n');
@@ -424,40 +522,46 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
IRBuilder<> Builder(Inst);
switch (Inst->getOpcode()) {
case Instruction::Load: {
- if (Inst->getType() == AT)
+ if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy())
break;
- Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+ if (!Index)
+ break;
+ Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
+ if (Inst->getType() != VecEltTy)
+ ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
Inst->replaceAllUsesWith(ExtractElement);
Inst->eraseFromParent();
break;
}
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(Inst);
- if (SI->getValueOperand()->getType() == AT)
+ if (SI->getValueOperand()->getType() == AllocaTy ||
+ SI->getValueOperand()->getType()->isVectorTy())
break;
- Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
Value *Ptr = SI->getPointerOperand();
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+ if (!Index)
+ break;
+
+ Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
- Value *NewVecValue = Builder.CreateInsertElement(VecValue,
- SI->getValueOperand(),
- Index);
+ Value *Elt = SI->getValueOperand();
+ if (Elt->getType() != VecEltTy)
+ Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
+ Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index);
Builder.CreateStore(NewVecValue, BitCast);
Inst->eraseFromParent();
break;
}
- case Instruction::BitCast:
- case Instruction::AddrSpaceCast:
- break;
default:
llvm_unreachable("Inconsistency in instructions promotable to vector");
@@ -659,16 +763,15 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
continue;
if (Use->getParent()->getParent() == &F) {
- unsigned Align = GV.getAlignment();
- if (Align == 0)
- Align = DL.getABITypeAlignment(GV.getValueType());
+ Align Alignment =
+ DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
// FIXME: Try to account for padding here. The padding is currently
// determined from the inverse order of uses in the function. I'm not
// sure if the use list order is in any way connected to this, so the
// total reported size is likely incorrect.
uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
- CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
+ CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alignment);
CurrentLocalMemUsage += AllocSize;
break;
}
@@ -722,6 +825,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
if (!I.isStaticAlloca() || I.isArrayAllocation())
return false;
+ const DataLayout &DL = Mod->getDataLayout();
IRBuilder<> Builder(&I);
// First try to replace the alloca with a vector
@@ -729,7 +833,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
- if (tryPromoteAllocaToVector(&I))
+ if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs))
return true; // Promoted to vector.
if (DisablePromoteAllocaToLDS)
@@ -759,11 +863,8 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction);
unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
- const DataLayout &DL = Mod->getDataLayout();
-
- unsigned Align = I.getAlignment();
- if (Align == 0)
- Align = DL.getABITypeAlignment(I.getAllocatedType());
+ Align Alignment =
+ DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());
// FIXME: This computed padding is likely wrong since it depends on inverse
// usage order.
@@ -771,7 +872,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
// FIXME: It is also possible that if we're allowed to use all of the memory
// we could end up using more than the maximum due to alignment padding.
- uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
+ uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
NewSize += AllocSize;
@@ -938,6 +1039,60 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
return true;
}
+bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
+ if (skipFunction(F) || DisablePromoteAllocaToVector)
+ return false;
+
+ const TargetMachine *TM;
+ if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
+ TM = &TPC->getTM<TargetMachine>();
+ else
+ return false;
+
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
+ if (!ST.isPromoteAllocaEnabled())
+ return false;
+
+ if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
+ const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+ MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+ } else {
+ MaxVGPRs = 128;
+ }
+
+ bool Changed = false;
+ BasicBlock &EntryBB = *F.begin();
+
+ SmallVector<AllocaInst *, 16> Allocas;
+ for (Instruction &I : EntryBB) {
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
+ Allocas.push_back(AI);
+ }
+
+ for (AllocaInst *AI : Allocas) {
+ if (handleAlloca(*AI))
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+bool AMDGPUPromoteAllocaToVector::handleAlloca(AllocaInst &I) {
+ // Array allocations are probably not worth handling, since an allocation of
+ // the array type is the canonical form.
+ if (!I.isStaticAlloca() || I.isArrayAllocation())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
+
+ Module *Mod = I.getParent()->getParent()->getParent();
+ return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
+}
+
FunctionPass *llvm::createAMDGPUPromoteAlloca() {
return new AMDGPUPromoteAlloca();
}
+
+FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
+ return new AMDGPUPromoteAllocaToVector();
+}
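
The register-budget check added to tryPromoteAllocaToVector above reduces to a small piece of arithmetic: the limit defaults to the VGPR budget in bits (MaxVGPRs * 32), or to the byte count given by -amdgpu-promote-alloca-to-vector-limit converted to bits, and an alloca is only considered if four copies of it fit under that limit (i.e. it may consume at most a quarter of the budget). The sketch below is a standalone restatement of that check for illustration only, not code from the patch; the helper name and the example sizes are assumptions, and the separate 2..16 element-count restriction is ignored here.

#include <cstdint>
#include <cstdio>

// Illustrative restatement of the size check in tryPromoteAllocaToVector:
// vectorization is allowed only when AllocaSizeInBits * 4 <= Limit,
// so the alloca uses at most 1/4 of the available register budget.
static bool fitsVectorBudget(uint64_t AllocaSizeInBits, unsigned MaxVGPRs,
                             unsigned LimitInBytes /* 0 = use VGPR budget */) {
  unsigned Limit = LimitInBytes ? LimitInBytes * 8  // explicit byte limit
                                : MaxVGPRs * 32;    // 32 bits per VGPR
  return AllocaSizeInBits * 4 <= Limit;
}

int main() {
  // With 256 VGPRs the limit is 8192 bits, so only allocas of up to
  // 2048 bits (256 bytes) pass the size check.
  std::printf("%d\n", fitsVectorBudget(64 * 32, 256, 0)); // 1: 2048 * 4 == 8192
  std::printf("%d\n", fitsVectorBudget(65 * 32, 256, 0)); // 0: 2080 * 4 >  8192
  return 0;
}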