Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp')
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp   211
1 file changed, 89 insertions(+), 122 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 99b7ffb33884..5a4426ba8113 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -334,86 +334,49 @@ static FixedVectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
                               ArrayTy->getNumElements());
 }
 
-static Value *stripBitcasts(Value *V) {
-  while (Instruction *I = dyn_cast<Instruction>(V)) {
-    if (I->getOpcode() != Instruction::BitCast)
-      break;
-    V = I->getOperand(0);
-  }
-  return V;
-}
-
 static Value *
 calculateVectorIndex(Value *Ptr,
                      const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
-  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(stripBitcasts(Ptr));
+  auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts());
   if (!GEP)
-    return nullptr;
+    return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext()));
 
   auto I = GEPIdx.find(GEP);
-  return I == GEPIdx.end() ? nullptr : I->second;
+  assert(I != GEPIdx.end() && "Must have entry for GEP!");
+  return I->second;
 }
 
-static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
-  // FIXME we only support simple cases
-  if (GEP->getNumOperands() != 3)
+static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
+                               Type *VecElemTy, const DataLayout &DL) {
+  // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
+  // helper.
+  unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
+  MapVector<Value *, APInt> VarOffsets;
+  APInt ConstOffset(BW, 0);
+  if (GEP->getPointerOperand()->stripPointerCasts() != Alloca ||
+      !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
     return nullptr;
 
-  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
-  if (!I0 || !I0->isZero())
+  unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy);
+  if (VarOffsets.size() > 1)
     return nullptr;
 
-  return GEP->getOperand(2);
-}
-
-// Not an instruction handled below to turn into a vector.
-//
-// TODO: Check isTriviallyVectorizable for calls and handle other
-// instructions.
-static bool canVectorizeInst(Instruction *Inst, User *User,
-                             const DataLayout &DL) {
-  switch (Inst->getOpcode()) {
-  case Instruction::Load: {
-    // Currently only handle the case where the Pointer Operand is a GEP.
-    // Also we could not vectorize volatile or atomic loads.
-    LoadInst *LI = cast<LoadInst>(Inst);
-    if (isa<AllocaInst>(User) &&
-        LI->getPointerOperandType() == User->getType() &&
-        isa<VectorType>(LI->getType()))
-      return true;
-
-    Instruction *PtrInst = dyn_cast<Instruction>(LI->getPointerOperand());
-    if (!PtrInst)
-      return false;
-
-    return (PtrInst->getOpcode() == Instruction::GetElementPtr ||
-            PtrInst->getOpcode() == Instruction::BitCast) &&
-           LI->isSimple();
+  if (VarOffsets.size() == 1) {
+    // Only handle cases where we don't need to insert extra arithmetic
+    // instructions.
+    const auto &VarOffset = VarOffsets.front();
+    if (!ConstOffset.isZero() || VarOffset.second != VecElemSize)
+      return nullptr;
+    return VarOffset.first;
   }
-  case Instruction::BitCast:
-    return true;
-  case Instruction::Store: {
-    // Must be the stored pointer operand, not a stored value, plus
-    // since it should be canonical form, the User should be a GEP.
-    // Also we could not vectorize volatile or atomic stores.
-    StoreInst *SI = cast<StoreInst>(Inst);
-    if (isa<AllocaInst>(User) &&
-        SI->getPointerOperandType() == User->getType() &&
-        isa<VectorType>(SI->getValueOperand()->getType()))
-      return true;
-
-    Instruction *UserInst = dyn_cast<Instruction>(User);
-    if (!UserInst)
-      return false;
+  APInt Quot;
+  uint64_t Rem;
+  APInt::udivrem(ConstOffset, VecElemSize, Quot, Rem);
+  if (Rem != 0)
+    return nullptr;
 
-    return (SI->getPointerOperand() == User) &&
-           (UserInst->getOpcode() == Instruction::GetElementPtr ||
-            UserInst->getOpcode() == Instruction::BitCast) &&
-           SI->isSimple();
-  }
-  default:
-    return false;
-  }
+  return ConstantInt::get(GEP->getContext(), Quot);
 }
 
 static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
@@ -455,73 +418,87 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
   }
 
   std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
-  std::vector<Value *> WorkList;
-  SmallVector<User *, 8> Users(Alloca->users());
-  SmallVector<User *, 8> UseUsers(Users.size(), Alloca);
+  SmallVector<Instruction *> WorkList;
+  SmallVector<Use *, 8> Uses;
+  for (Use &U : Alloca->uses())
+    Uses.push_back(&U);
+
   Type *VecEltTy = VectorTy->getElementType();
-  while (!Users.empty()) {
-    User *AllocaUser = Users.pop_back_val();
-    User *UseUser = UseUsers.pop_back_val();
-    Instruction *Inst = dyn_cast<Instruction>(AllocaUser);
+  while (!Uses.empty()) {
+    Use *U = Uses.pop_back_val();
+    Instruction *Inst = dyn_cast<Instruction>(U->getUser());
 
-    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
-    if (!GEP) {
-      if (!canVectorizeInst(Inst, UseUser, DL))
+    if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
+      // This is a store of the pointer, not to the pointer.
+      if (isa<StoreInst>(Inst) &&
+          U->getOperandNo() != StoreInst::getPointerOperandIndex())
         return false;
 
-      if (Inst->getOpcode() == Instruction::BitCast) {
-        Type *FromTy = Inst->getOperand(0)->getType()->getPointerElementType();
-        Type *ToTy = Inst->getType()->getPointerElementType();
-        if (FromTy->isAggregateType() || ToTy->isAggregateType() ||
-            DL.getTypeSizeInBits(FromTy) != DL.getTypeSizeInBits(ToTy))
-          continue;
-
-        for (User *CastUser : Inst->users()) {
-          if (isAssumeLikeIntrinsic(cast<Instruction>(CastUser)))
-            continue;
-          Users.push_back(CastUser);
-          UseUsers.push_back(Inst);
-        }
+      Type *AccessTy = getLoadStoreType(Inst);
+      Ptr = Ptr->stripPointerCasts();
 
+      // Alloca already accessed as vector, leave alone.
+      if (Ptr == Alloca && DL.getTypeStoreSize(Alloca->getAllocatedType()) ==
+                               DL.getTypeStoreSize(AccessTy))
         continue;
-      }
 
-      WorkList.push_back(AllocaUser);
+      // Check that this is a simple access of a vector element.
+      bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
+                                          : cast<StoreInst>(Inst)->isSimple();
+      if (!IsSimple ||
+          !CastInst::isBitOrNoopPointerCastable(VecEltTy, AccessTy, DL))
+        return false;
+
+      WorkList.push_back(Inst);
       continue;
     }
 
-    Value *Index = GEPToVectorIndex(GEP);
+    if (isa<BitCastInst>(Inst)) {
+      // Look through bitcasts.
+      for (Use &U : Inst->uses())
+        Uses.push_back(&U);
+      continue;
+    }
 
-    // If we can't compute a vector index from this GEP, then we can't
-    // promote this alloca to vector.
-    if (!Index) {
-      LLVM_DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP
-                        << '\n');
-      return false;
+    if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
+      // If we can't compute a vector index from this GEP, then we can't
+      // promote this alloca to vector.
+      Value *Index = GEPToVectorIndex(GEP, Alloca, VecEltTy, DL);
+      if (!Index) {
+        LLVM_DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP
+                          << '\n');
        return false;
+      }
+
+      GEPVectorIdx[GEP] = Index;
+      for (Use &U : Inst->uses())
+        Uses.push_back(&U);
+      continue;
     }
 
-    GEPVectorIdx[GEP] = Index;
-    Users.append(GEP->user_begin(), GEP->user_end());
-    UseUsers.append(GEP->getNumUses(), GEP);
+    // Ignore assume-like intrinsics and comparisons used in assumes.
+    if (isAssumeLikeIntrinsic(Inst))
+      continue;
+
+    if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
+          return isAssumeLikeIntrinsic(cast<Instruction>(U));
+        }))
+      continue;
+
+    // Unknown user.
+    return false;
   }
 
   LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
                     << *VectorTy << '\n');
 
-  for (Value *V : WorkList) {
-    Instruction *Inst = cast<Instruction>(V);
+  for (Instruction *Inst : WorkList) {
     IRBuilder<> Builder(Inst);
     switch (Inst->getOpcode()) {
     case Instruction::Load: {
-      if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy())
-        break;
-
       Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
-      if (!Index)
-        break;
-
-      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
+      Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace());
       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
       Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
       Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
@@ -533,16 +510,9 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
     }
     case Instruction::Store: {
       StoreInst *SI = cast<StoreInst>(Inst);
-      if (SI->getValueOperand()->getType() == AllocaTy ||
-          SI->getValueOperand()->getType()->isVectorTy())
-        break;
-
       Value *Ptr = SI->getPointerOperand();
       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
-      if (!Index)
-        break;
-
-      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
+      Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace());
       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
       Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
       Value *Elt = SI->getValueOperand();
@@ -808,10 +778,7 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
   //
   // FIXME: We should really do something to fix the addresses to a more optimal
   // value instead
-  llvm::sort(AllocatedSizes, [](std::pair<uint64_t, Align> LHS,
-                                std::pair<uint64_t, Align> RHS) {
-    return LHS.second < RHS.second;
-  });
+  llvm::sort(AllocatedSizes, llvm::less_second());
 
   // Check how much local memory is being used by global objects
   CurrentLocalMemUsage = 0;
@@ -917,7 +884,7 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
   // usage order.
   //
   // FIXME: It is also possible that if we're allowed to use all of the memory
-  // could could end up using more than the maximum due to alignment padding.
+  // could end up using more than the maximum due to alignment padding.
 
   uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
   uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
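Note on the new GEPToVectorIndex: instead of accepting only the literal "getelementptr %alloca, 0, %idx" shape, it asks GEP::collectOffset() to decompose the GEP into scaled variable offsets plus a constant byte offset, then accepts exactly two shapes: a single variable scaled by the element size with no constant part, or a pure constant offset that is a multiple of the element size. A minimal plain-integer sketch of that accept/reject rule outside of LLVM (GepOffset, varIndexUsable, and constLane are illustrative names, not from the patch):

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <optional>

// Stand-in for collectOffset()'s result: the GEP's offset from the
// alloca, decomposed as Scale * VarIndex + Const bytes.
struct GepOffset {
  bool HasVarIndex;  // one variable index term present?
  uint64_t Scale;    // bytes per step of that variable index
  uint64_t Const;    // constant byte offset
};

// Variable case: usable only as "VarIndex * ElemSize" exactly, so the
// existing index value can be reused with no new arithmetic.
bool varIndexUsable(const GepOffset &O, uint64_t ElemSize) {
  return O.HasVarIndex && O.Const == 0 && O.Scale == ElemSize;
}

// Constant case: the byte offset must divide evenly into a lane number.
std::optional<uint64_t> constLane(const GepOffset &O, uint64_t ElemSize) {
  if (O.HasVarIndex || O.Const % ElemSize != 0)
    return std::nullopt;
  return O.Const / ElemSize;
}

int main() {
  // Byte offset 8 into a <4 x i32> alloca -> lane 2.
  GepOffset ConstGep{false, 0, 8};
  assert(constLane(ConstGep, 4) == 2);

  // "i * 4" with 4-byte elements -> the index i is the lane itself.
  GepOffset VarGep{true, 4, 0};
  assert(varIndexUsable(VarGep, 4));

  // "i * 8 + 2" -> rejected: misaligned and would need new arithmetic.
  GepOffset BadGep{true, 8, 2};
  assert(!varIndexUsable(BadGep, 4) && !constLane(BadGep, 4));
  std::puts("all offset checks behave as expected");
}

Everything else is rejected so that promotion never has to emit extra index arithmetic, the same reasoning as the in-tree comment above the VarOffsets.size() == 1 case.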
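Once the use-walk succeeds, each queued scalar access is rewritten as a whole-vector access plus an element operation. A condensed sketch of the Store case as a free-standing helper, assuming LLVM's IR headers; rewriteElementStore is an illustrative name (the patch does this inline in tryPromoteAllocaToVector), and the insertelement/store tail is not visible in the hunk above, so treat it as an assumption about the surrounding code:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// A scalar store into one lane becomes: load the whole vector from the
// alloca, insertelement at the computed index, store the vector back.
static void rewriteElementStore(StoreInst *SI, AllocaInst *Alloca,
                                FixedVectorType *VectorTy, Value *Index) {
  IRBuilder<> Builder(SI);
  Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace());
  Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
  Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
  Value *Elt = SI->getValueOperand();
  // Lanes may need a bit/pointer cast to the element type; the use-walk
  // guarded this with CastInst::isBitOrNoopPointerCastable.
  Type *VecEltTy = VectorTy->getElementType();
  if (Elt->getType() != VecEltTy)
    Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
  Value *NewVec = Builder.CreateInsertElement(VecValue, Elt, Index);
  Builder.CreateStore(NewVec, BitCast);
  SI->eraseFromParent();
}

The Load case is symmetric: bitcast the alloca, load the whole vector, and extractelement at the computed index.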
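The hasSufficientLocalMem hunk is a behavior-preserving cleanup: llvm::less_second() is a utility comparator equivalent to the removed hand-written lambda. A standard-C++ demonstration of the same ordering, with the pair layout mirroring AllocatedSizes and plain unsigned standing in for llvm::Align:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // {allocated size, alignment}, as in AllocatedSizes.
  std::vector<std::pair<uint64_t, unsigned>> Sizes = {
      {256, 8}, {64, 16}, {128, 4}};
  // Ascending alignment, the order the removed lambda produced;
  // llvm::less_second() packages exactly this "compare by .second".
  std::sort(Sizes.begin(), Sizes.end(),
            [](const auto &L, const auto &R) { return L.second < R.second; });
  for (const auto &[Size, Align] : Sizes)
    std::printf("align %u: size %llu\n", Align, (unsigned long long)Size);
  return 0;
}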
