aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp')
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp 122
1 files changed, 105 insertions, 17 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 5a4426ba8113..a7da4005e867 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -221,7 +221,7 @@ AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
ST.makeLIDRangeMetadata(LocalSizeY);
ST.makeLIDRangeMetadata(LocalSizeZ);
- return std::make_pair(LocalSizeY, LocalSizeZ);
+ return std::pair(LocalSizeY, LocalSizeZ);
}
// We must read the size out of the dispatch pointer.
@@ -282,7 +282,7 @@ AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2);
LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
- MDNode *MD = MDNode::get(Mod->getContext(), None);
+ MDNode *MD = MDNode::get(Mod->getContext(), std::nullopt);
LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
ST.makeLIDRangeMetadata(LoadZU);
@@ -290,7 +290,7 @@ AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
// Extract y component. Upper half of LoadZU should be zero already.
Value *Y = Builder.CreateLShr(LoadXY, 16);
- return std::make_pair(Y, LoadZU);
+ return std::pair(Y, LoadZU);
}
Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
@@ -379,6 +379,11 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
return ConstantInt::get(GEP->getContext(), Quot);
}
+struct MemTransferInfo { // Per-MemTransferInst record, stored in the TransferInfo map while the alloca's uses are scanned.
+ ConstantInt *SrcIndex = nullptr; // Constant vector index of the transfer's source pointer; stays null until the source-operand use is visited.
+ ConstantInt *DestIndex = nullptr; // Constant vector index of the transfer's destination pointer; promotion is abandoned if either index is still null after the scan.
+};
+
static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
unsigned MaxVGPRs) {
@@ -419,14 +424,18 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
SmallVector<Instruction *> WorkList;
+ SmallVector<Instruction *> DeferredInsts;
SmallVector<Use *, 8> Uses;
+ DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
+
for (Use &U : Alloca->uses())
Uses.push_back(&U);
Type *VecEltTy = VectorTy->getElementType();
+ unsigned ElementSize = DL.getTypeSizeInBits(VecEltTy) / 8;
while (!Uses.empty()) {
Use *U = Uses.pop_back_val();
- Instruction *Inst = dyn_cast<Instruction>(U->getUser());
+ Instruction *Inst = cast<Instruction>(U->getUser());
if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
// This is a store of the pointer, not to the pointer.
@@ -476,6 +485,47 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
continue;
}
+ if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Inst)) {
+ if (TransferInst->isVolatile())
+ return false;
+
+ ConstantInt *Len = dyn_cast<ConstantInt>(TransferInst->getLength());
+ if (!Len || !!(Len->getZExtValue() % ElementSize))
+ return false;
+
+ if (!TransferInfo.count(TransferInst)) {
+ DeferredInsts.push_back(Inst);
+ WorkList.push_back(Inst);
+ TransferInfo[TransferInst] = MemTransferInfo();
+ }
+
+ auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * {
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ if (Ptr != Alloca && !GEPVectorIdx.count(GEP))
+ return nullptr;
+
+ return dyn_cast<ConstantInt>(calculateVectorIndex(Ptr, GEPVectorIdx));
+ };
+
+ unsigned OpNum = U->getOperandNo();
+ MemTransferInfo *TI = &TransferInfo[TransferInst];
+ if (OpNum == 0) {
+ Value *Dest = TransferInst->getDest();
+ ConstantInt *Index = getPointerIndexOfAlloca(Dest);
+ if (!Index)
+ return false;
+ TI->DestIndex = Index;
+ } else {
+ assert(OpNum == 1);
+ Value *Src = TransferInst->getSource();
+ ConstantInt *Index = getPointerIndexOfAlloca(Src);
+ if (!Index)
+ return false;
+ TI->SrcIndex = Index;
+ }
+ continue;
+ }
+
// Ignore assume-like intrinsics and comparisons used in assumes.
if (isAssumeLikeIntrinsic(Inst))
continue;
@@ -489,6 +539,16 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
return false;
}
+ while (!DeferredInsts.empty()) {
+ Instruction *Inst = DeferredInsts.pop_back_val();
+ MemTransferInst *TransferInst = cast<MemTransferInst>(Inst);
+ // TODO: Support the case if the pointers are from different alloca or
+ // from different address spaces.
+ MemTransferInfo &Info = TransferInfo[TransferInst];
+ if (!Info.SrcIndex || !Info.DestIndex)
+ return false;
+ }
+
LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
<< *VectorTy << '\n');
@@ -500,7 +560,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace());
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
- Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
+ Value *VecValue =
+ Builder.CreateAlignedLoad(VectorTy, BitCast, Alloca->getAlign());
Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
if (Inst->getType() != VecEltTy)
ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
@@ -514,15 +575,45 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace());
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
- Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
+ Value *VecValue =
+ Builder.CreateAlignedLoad(VectorTy, BitCast, Alloca->getAlign());
Value *Elt = SI->getValueOperand();
if (Elt->getType() != VecEltTy)
Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index);
- Builder.CreateStore(NewVecValue, BitCast);
+ Builder.CreateAlignedStore(NewVecValue, BitCast, Alloca->getAlign());
Inst->eraseFromParent();
break;
}
+ case Instruction::Call: {
+ if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) {
+ ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
+ unsigned NumCopied = Length->getZExtValue() / ElementSize;
+ MemTransferInfo *TI = &TransferInfo[cast<MemTransferInst>(Inst)];
+ unsigned SrcBegin = TI->SrcIndex->getZExtValue();
+ unsigned DestBegin = TI->DestIndex->getZExtValue();
+
+ SmallVector<int> Mask;
+ for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
+ if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
+ Mask.push_back(SrcBegin++);
+ } else {
+ Mask.push_back(Idx);
+ }
+ }
+ Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace());
+ Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
+ Value *VecValue =
+ Builder.CreateAlignedLoad(VectorTy, BitCast, Alloca->getAlign());
+ Value *NewVecValue = Builder.CreateShuffleVector(VecValue, Mask);
+ Builder.CreateAlignedStore(NewVecValue, BitCast, Alloca->getAlign());
+
+ Inst->eraseFromParent();
+ } else {
+ llvm_unreachable("Unsupported call when promoting alloca to vector");
+ }
+ break;
+ }
default:
llvm_unreachable("Inconsistency in instructions promotable to vector");
@@ -707,7 +798,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
}
}
- LocalMemLimit = ST.getLocalMemorySize();
+ LocalMemLimit = ST.getAddressableLocalMemorySize();
if (LocalMemLimit == 0)
return false;
@@ -911,12 +1002,9 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
GlobalVariable *GV = new GlobalVariable(
- *Mod, GVTy, false, GlobalValue::InternalLinkage,
- UndefValue::get(GVTy),
- Twine(F->getName()) + Twine('.') + I.getName(),
- nullptr,
- GlobalVariable::NotThreadLocal,
- AMDGPUAS::LOCAL_ADDRESS);
+ *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy),
+ Twine(F->getName()) + Twine('.') + I.getName(), nullptr,
+ GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
GV->setAlignment(I.getAlign());
@@ -1008,9 +1096,9 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
continue;
case Intrinsic::memset: {
MemSetInst *MemSet = cast<MemSetInst>(Intr);
- Builder.CreateMemSet(
- MemSet->getRawDest(), MemSet->getValue(), MemSet->getLength(),
- MaybeAlign(MemSet->getDestAlignment()), MemSet->isVolatile());
+ Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
+ MemSet->getLength(), MemSet->getDestAlign(),
+ MemSet->isVolatile());
Intr->eraseFromParent();
continue;
}