Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 255
1 file changed, 205 insertions, 50 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 14958a180ce3..727f71b35049 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -76,6 +76,11 @@ static cl::opt<bool> DisablePromoteAllocaToLDS(
   cl::desc("Disable promote alloca to LDS"),
   cl::init(false));
 
+static cl::opt<unsigned> PromoteAllocaToVectorLimit(
+  "amdgpu-promote-alloca-to-vector-limit",
+  cl::desc("Maximum byte size to consider promote alloca to vector"),
+  cl::init(0));
+
 // FIXME: This can create globals so should be a module pass.
 class AMDGPUPromoteAlloca : public FunctionPass {
 private:
@@ -86,6 +91,7 @@ private:
   // FIXME: This should be per-kernel.
   uint32_t LocalMemLimit = 0;
   uint32_t CurrentLocalMemUsage = 0;
+  unsigned MaxVGPRs;
 
   bool IsAMDGCN = false;
   bool IsAMDHSA = false;
@@ -128,14 +134,42 @@ public:
   }
 };
 
+class AMDGPUPromoteAllocaToVector : public FunctionPass {
+private:
+  unsigned MaxVGPRs;
+
+public:
+  static char ID;
+
+  AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  StringRef getPassName() const override {
+    return "AMDGPU Promote Alloca to vector";
+  }
+
+  bool handleAlloca(AllocaInst &I);
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+};
+
 } // end anonymous namespace
 
 char AMDGPUPromoteAlloca::ID = 0;
+char AMDGPUPromoteAllocaToVector::ID = 0;
 
 INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
                 "AMDGPU promote alloca to vector or LDS", false, false)
 
+INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
+                "AMDGPU promote alloca to vector", false, false)
+
 char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
+char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
 
 bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
   Mod = &M;
@@ -161,6 +195,13 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
   if (!ST.isPromoteAllocaEnabled())
     return false;
 
+  if (IsAMDGCN) {
+    const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+    MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+  } else {
+    MaxVGPRs = 128;
+  }
+
   bool SufficientLDS = hasSufficientLocalMem(F);
   bool Changed = false;
   BasicBlock &EntryBB = *F.begin();
@@ -251,10 +292,10 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
   // 32-bit and extract sequence is already present, and it is probably easier
   // to CSE this. The loads should be mergable later anyway.
   Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1);
-  LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, 4);
+  LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
 
   Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2);
-  LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, 4);
+  LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
 
   MDNode *MD = MDNode::get(Mod->getContext(), None);
   LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
@@ -297,15 +338,26 @@ Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
   return CI;
 }
 
-static VectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
-  return VectorType::get(ArrayTy->getElementType(),
-                         ArrayTy->getNumElements());
+static FixedVectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
+  return FixedVectorType::get(ArrayTy->getElementType(),
+                              ArrayTy->getNumElements());
+}
+
+static Value *stripBitcasts(Value *V) {
+  while (Instruction *I = dyn_cast<Instruction>(V)) {
+    if (I->getOpcode() != Instruction::BitCast)
+      break;
+    V = I->getOperand(0);
+  }
+  return V;
 }
 
 static Value *
 calculateVectorIndex(Value *Ptr,
                      const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
-  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(stripBitcasts(Ptr));
+  if (!GEP)
+    return nullptr;
 
   auto I = GEPIdx.find(GEP);
   return I == GEPIdx.end() ? nullptr : I->second;
@@ -327,7 +379,8 @@ static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
 //
 // TODO: Check isTriviallyVectorizable for calls and handle other
 // instructions.
-static bool canVectorizeInst(Instruction *Inst, User *User) {
+static bool canVectorizeInst(Instruction *Inst, User *User,
+                             const DataLayout &DL) {
   switch (Inst->getOpcode()) {
   case Instruction::Load: {
     // Currently only handle the case where the Pointer Operand is a GEP.
@@ -337,7 +390,14 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
         LI->getPointerOperandType() == User->getType() &&
         isa<VectorType>(LI->getType()))
       return true;
-    return isa<GetElementPtrInst>(LI->getPointerOperand()) && LI->isSimple();
+
+    Instruction *PtrInst = dyn_cast<Instruction>(LI->getPointerOperand());
+    if (!PtrInst)
+      return false;
+
+    return (PtrInst->getOpcode() == Instruction::GetElementPtr ||
+            PtrInst->getOpcode() == Instruction::BitCast) &&
+           LI->isSimple();
   }
   case Instruction::BitCast:
     return true;
@@ -350,22 +410,46 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
         SI->getPointerOperandType() == User->getType() &&
         isa<VectorType>(SI->getValueOperand()->getType()))
       return true;
-    return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && SI->isSimple();
+
+    Instruction *UserInst = dyn_cast<Instruction>(User);
+    if (!UserInst)
+      return false;
+
+    return (SI->getPointerOperand() == User) &&
+           (UserInst->getOpcode() == Instruction::GetElementPtr ||
+            UserInst->getOpcode() == Instruction::BitCast) &&
+           SI->isSimple();
   }
   default:
     return false;
   }
 }
 
-static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
+                                     unsigned MaxVGPRs) {
   if (DisablePromoteAllocaToVector) {
     LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n");
     return false;
   }
 
-  Type *AT = Alloca->getAllocatedType();
-  SequentialType *AllocaTy = dyn_cast<SequentialType>(AT);
+  Type *AllocaTy = Alloca->getAllocatedType();
+  auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
+  if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
+    if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
+        ArrayTy->getNumElements() > 0)
+      VectorTy = arrayTypeToVecType(ArrayTy);
+  }
+
+  // Use up to 1/4 of available register budget for vectorization.
+  unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
+                                              : (MaxVGPRs * 32);
+
+  if (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit) {
+    LLVM_DEBUG(dbgs() << " Alloca too big for vectorization with "
                      << MaxVGPRs << " registers available\n");
+    return false;
+  }
 
   LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
 
@@ -373,22 +457,44 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
   // are just being conservative for now.
   // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. Potentially these
   // could also be promoted but we don't currently handle this case
-  if (!AllocaTy ||
-      AllocaTy->getNumElements() > 16 ||
-      AllocaTy->getNumElements() < 2 ||
-      !VectorType::isValidElementType(AllocaTy->getElementType())) {
+  if (!VectorTy || VectorTy->getNumElements() > 16 ||
+      VectorTy->getNumElements() < 2) {
     LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
     return false;
   }
 
   std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
-  std::vector<Value*> WorkList;
-  for (User *AllocaUser : Alloca->users()) {
+  std::vector<Value *> WorkList;
+  SmallVector<User *, 8> Users(Alloca->users());
+  SmallVector<User *, 8> UseUsers(Users.size(), Alloca);
+  Type *VecEltTy = VectorTy->getElementType();
+  while (!Users.empty()) {
+    User *AllocaUser = Users.pop_back_val();
+    User *UseUser = UseUsers.pop_back_val();
+    Instruction *Inst = dyn_cast<Instruction>(AllocaUser);
+
     GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
     if (!GEP) {
-      if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
+      if (!canVectorizeInst(Inst, UseUser, DL))
         return false;
 
+      if (Inst->getOpcode() == Instruction::BitCast) {
+        Type *FromTy = Inst->getOperand(0)->getType()->getPointerElementType();
+        Type *ToTy = Inst->getType()->getPointerElementType();
+        if (FromTy->isAggregateType() || ToTy->isAggregateType() ||
+            DL.getTypeSizeInBits(FromTy) != DL.getTypeSizeInBits(ToTy))
+          continue;
+
+        for (User *CastUser : Inst->users()) {
+          if (isAssumeLikeIntrinsic(cast<Instruction>(CastUser)))
+            continue;
+          Users.push_back(CastUser);
+          UseUsers.push_back(Inst);
+        }
+
+        continue;
+      }
+
       WorkList.push_back(AllocaUser);
       continue;
     }
@@ -404,18 +510,10 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
     }
 
     GEPVectorIdx[GEP] = Index;
-    for (User *GEPUser : AllocaUser->users()) {
-      if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
-        return false;
-
-      WorkList.push_back(GEPUser);
-    }
+    Users.append(GEP->user_begin(), GEP->user_end());
+    UseUsers.append(GEP->getNumUses(), GEP);
   }
 
-  VectorType *VectorTy = dyn_cast<VectorType>(AllocaTy);
-  if (!VectorTy)
-    VectorTy = arrayTypeToVecType(cast<ArrayType>(AllocaTy));
-
   LLVM_DEBUG(dbgs() << " Converting alloca to vector "
                     << *AllocaTy << " -> " << *VectorTy << '\n');
 
@@ -424,40 +522,46 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
     IRBuilder<> Builder(Inst);
     switch (Inst->getOpcode()) {
     case Instruction::Load: {
-      if (Inst->getType() == AT)
+      if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy())
        break;
 
-      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
       Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+      if (!Index)
+        break;
+      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
       Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
       Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
+      if (Inst->getType() != VecEltTy)
+        ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
       Inst->replaceAllUsesWith(ExtractElement);
       Inst->eraseFromParent();
       break;
     }
     case Instruction::Store: {
       StoreInst *SI = cast<StoreInst>(Inst);
-      if (SI->getValueOperand()->getType() == AT)
+      if (SI->getValueOperand()->getType() == AllocaTy ||
+          SI->getValueOperand()->getType()->isVectorTy())
        break;
 
-      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
       Value *Ptr = SI->getPointerOperand();
       Value *Index = calculateVectorIndex(Ptr,
                                           GEPVectorIdx);
+      if (!Index)
+        break;
+
+      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
       Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
-      Value *NewVecValue = Builder.CreateInsertElement(VecValue,
-                                                       SI->getValueOperand(),
-                                                       Index);
+      Value *Elt = SI->getValueOperand();
+      if (Elt->getType() != VecEltTy)
+        Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
+      Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index);
       Builder.CreateStore(NewVecValue, BitCast);
       Inst->eraseFromParent();
       break;
     }
-    case Instruction::BitCast:
-    case Instruction::AddrSpaceCast:
-      break;
 
     default:
       llvm_unreachable("Inconsistency in instructions promotable to vector");
@@ -659,16 +763,15 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
         continue;
 
       if (Use->getParent()->getParent() == &F) {
-        unsigned Align = GV.getAlignment();
-        if (Align == 0)
-          Align = DL.getABITypeAlignment(GV.getValueType());
+        Align Alignment =
+            DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
 
         // FIXME: Try to account for padding here. The padding is currently
        // determined from the inverse order of uses in the function. I'm not
        // sure if the use list order is in any way connected to this, so the
        // total reported size is likely incorrect.
        uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
-        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
+        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alignment);
        CurrentLocalMemUsage += AllocSize;
        break;
       }
@@ -722,6 +825,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
   if (!I.isStaticAlloca() || I.isArrayAllocation())
     return false;
 
+  const DataLayout &DL = Mod->getDataLayout();
   IRBuilder<> Builder(&I);
 
   // First try to replace the alloca with a vector
@@ -729,7 +833,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
 
   LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
 
-  if (tryPromoteAllocaToVector(&I))
+  if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs))
     return true; // Promoted to vector.
 
   if (DisablePromoteAllocaToLDS)
@@ -759,11 +863,8 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction);
   unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
 
-  const DataLayout &DL = Mod->getDataLayout();
-
-  unsigned Align = I.getAlignment();
-  if (Align == 0)
-    Align = DL.getABITypeAlignment(I.getAllocatedType());
+  Align Alignment =
+      DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());
 
   // FIXME: This computed padding is likely wrong since it depends on inverse
   // usage order.
@@ -771,7 +872,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
 
   // FIXME: It is also possible that if we're allowed to use all of the memory
   // could could end up using more than the maximum due to alignment padding.
-  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
+  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
   uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
   NewSize += AllocSize;
 
@@ -938,6 +1039,60 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
   return true;
 }
 
+bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
+  if (skipFunction(F) || DisablePromoteAllocaToVector)
+    return false;
+
+  const TargetMachine *TM;
+  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
+    TM = &TPC->getTM<TargetMachine>();
+  else
+    return false;
+
+  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
+  if (!ST.isPromoteAllocaEnabled())
+    return false;
+
+  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
+    const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+    MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+  } else {
+    MaxVGPRs = 128;
+  }
+
+  bool Changed = false;
+  BasicBlock &EntryBB = *F.begin();
+
+  SmallVector<AllocaInst *, 16> Allocas;
+  for (Instruction &I : EntryBB) {
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
+      Allocas.push_back(AI);
+  }
+
+  for (AllocaInst *AI : Allocas) {
+    if (handleAlloca(*AI))
+      Changed = true;
+  }
+
+  return Changed;
+}
+
+bool AMDGPUPromoteAllocaToVector::handleAlloca(AllocaInst &I) {
+  // Array allocations are probably not worth handling, since an allocation of
+  // the array type is the canonical form.
+  if (!I.isStaticAlloca() || I.isArrayAllocation())
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
+
+  Module *Mod = I.getParent()->getParent()->getParent();
+  return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
+}
+
 FunctionPass *llvm::createAMDGPUPromoteAlloca() {
   return new AMDGPUPromoteAlloca();
 }
+
+FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
+  return new AMDGPUPromoteAllocaToVector();
+}
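Note on the new size gate in tryPromoteAllocaToVector(): the -amdgpu-promote-alloca-to-vector-limit value is given in bytes (hence the * 8 conversion to bits), the default budget is MaxVGPRs * 32 bits (each VGPR is 32 bits wide per lane), and multiplying the alloca size by 4 caps promotion at a quarter of that budget. The following is a minimal standalone sketch of that arithmetic, not part of the patch; fitsVectorBudget and its parameter names are illustrative only.

#include <cstdint>
#include <cstdio>

// Returns true if an alloca of the given bit size may be promoted under the
// "use up to 1/4 of the available register budget" rule from the patch.
// LimitBytesOpt mirrors -amdgpu-promote-alloca-to-vector-limit (0 = unset).
static bool fitsVectorBudget(uint64_t AllocaSizeBits, unsigned MaxVGPRs,
                             unsigned LimitBytesOpt) {
  // A VGPR holds 32 bits per lane, so the default budget is MaxVGPRs * 32
  // bits; an explicit byte limit overrides it (hence the * 8 to bits).
  uint64_t LimitBits = LimitBytesOpt ? uint64_t(LimitBytesOpt) * 8
                                     : uint64_t(MaxVGPRs) * 32;
  // Multiplying the alloca size by 4 enforces the quarter-of-budget cap,
  // matching the "DL.getTypeSizeInBits(AllocaTy) * 4 > Limit" check above.
  return AllocaSizeBits * 4 <= LimitBits;
}

int main() {
  // [16 x i32] is 512 bits; with 256 VGPRs the budget is 8192 bits and
  // 512 * 4 = 2048 <= 8192, so such an alloca passes the size check.
  std::printf("%d\n", fitsVectorBudget(512, 256, 0));  // prints 1
  // A 1024-byte (8192-bit) array exceeds a quarter of the same budget.
  std::printf("%d\n", fitsVectorBudget(8192, 256, 0)); // prints 0
  return 0;
}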