Diffstat (limited to 'lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp')
-rw-r--r-- | lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 213
1 file changed, 114 insertions, 99 deletions
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 85184b363905e..07f92918a43fe 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -97,6 +97,9 @@ private:
                                        Instruction *UseInst,
                                        int OpIdx0, int OpIdx1) const;
 
+  /// Check whether we have enough local memory for promotion.
+  bool hasSufficientLocalMem(const Function &F);
+
 public:
   static char ID;
 
@@ -107,7 +110,7 @@ public:
 
   StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
 
-  void handleAlloca(AllocaInst &I);
+  bool handleAlloca(AllocaInst &I, bool SufficientLDS);
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
@@ -147,105 +150,21 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
   if (!ST.isPromoteAllocaEnabled())
     return false;
 
-  AS = AMDGPU::getAMDGPUAS(*F.getParent());
-
-  FunctionType *FTy = F.getFunctionType();
-
-  // If the function has any arguments in the local address space, then it's
-  // possible these arguments require the entire local memory space, so
-  // we cannot use local memory in the pass.
-  for (Type *ParamTy : FTy->params()) {
-    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
-    if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
-      LocalMemLimit = 0;
-      DEBUG(dbgs() << "Function has local memory argument. Promoting to "
-                      "local memory disabled.\n");
-      return false;
-    }
-  }
-
-  LocalMemLimit = ST.getLocalMemorySize();
-  if (LocalMemLimit == 0)
-    return false;
-
-  const DataLayout &DL = Mod->getDataLayout();
-
-  // Check how much local memory is being used by global objects
-  CurrentLocalMemUsage = 0;
-  for (GlobalVariable &GV : Mod->globals()) {
-    if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
-      continue;
-
-    for (const User *U : GV.users()) {
-      const Instruction *Use = dyn_cast<Instruction>(U);
-      if (!Use)
-        continue;
-
-      if (Use->getParent()->getParent() == &F) {
-        unsigned Align = GV.getAlignment();
-        if (Align == 0)
-          Align = DL.getABITypeAlignment(GV.getValueType());
-        // FIXME: Try to account for padding here. The padding is currently
-        // determined from the inverse order of uses in the function. I'm not
-        // sure if the use list order is in any way connected to this, so the
-        // total reported size is likely incorrect.
-        uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
-        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
-        CurrentLocalMemUsage += AllocSize;
-        break;
-      }
-    }
-  }
-
-  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
-                                                          F);
-
-  // Restrict local memory usage so that we don't drastically reduce occupancy,
-  // unless it is already significantly reduced.
-
-  // TODO: Have some sort of hint or other heuristics to guess occupancy based
-  // on other factors..
-  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
-  if (OccupancyHint == 0)
-    OccupancyHint = 7;
-
-  // Clamp to max value.
-  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
-
-  // Check the hint but ignore it if it's obviously wrong from the existing LDS
-  // usage.
-  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
-
-
-  // Round up to the next tier of usage.
-  unsigned MaxSizeWithWaveCount
-    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
-
-  // Program is possibly broken by using more local mem than available.
-  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
-    return false;
-
-  LocalMemLimit = MaxSizeWithWaveCount;
-
-  DEBUG(
-    dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
-    << "  Rounding size to " << MaxSizeWithWaveCount
-    << " with a maximum occupancy of " << MaxOccupancy << '\n'
-    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
-    << " available for promotion\n"
-  );
+  AS = AMDGPU::getAMDGPUAS(*F.getParent());
+
+  bool SufficientLDS = hasSufficientLocalMem(F);
+  bool Changed = false;
   BasicBlock &EntryBB = *F.begin();
   for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
     AllocaInst *AI = dyn_cast<AllocaInst>(I);
 
     ++I;
     if (AI)
-      handleAlloca(*AI);
+      Changed |= handleAlloca(*AI, SufficientLDS);
   }
 
-  return true;
+  return Changed;
 }
 
 std::pair<Value *, Value *>
@@ -661,12 +580,105 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
   return true;
 }
 
+bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
+
+  FunctionType *FTy = F.getFunctionType();
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+
+  // If the function has any arguments in the local address space, then it's
+  // possible these arguments require the entire local memory space, so
+  // we cannot use local memory in the pass.
+  for (Type *ParamTy : FTy->params()) {
+    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
+    if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
+      LocalMemLimit = 0;
+      DEBUG(dbgs() << "Function has local memory argument. Promoting to "
+                      "local memory disabled.\n");
+      return false;
+    }
+  }
+
+  LocalMemLimit = ST.getLocalMemorySize();
+  if (LocalMemLimit == 0)
+    return false;
+
+  const DataLayout &DL = Mod->getDataLayout();
+
+  // Check how much local memory is being used by global objects
+  CurrentLocalMemUsage = 0;
+  for (GlobalVariable &GV : Mod->globals()) {
+    if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
+      continue;
+
+    for (const User *U : GV.users()) {
+      const Instruction *Use = dyn_cast<Instruction>(U);
+      if (!Use)
+        continue;
+
+      if (Use->getParent()->getParent() == &F) {
+        unsigned Align = GV.getAlignment();
+        if (Align == 0)
+          Align = DL.getABITypeAlignment(GV.getValueType());
+
+        // FIXME: Try to account for padding here. The padding is currently
+        // determined from the inverse order of uses in the function. I'm not
+        // sure if the use list order is in any way connected to this, so the
+        // total reported size is likely incorrect.
+        uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
+        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
+        CurrentLocalMemUsage += AllocSize;
+        break;
+      }
+    }
+  }
+
+  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
+                                                          F);
+
+  // Restrict local memory usage so that we don't drastically reduce occupancy,
+  // unless it is already significantly reduced.
+
+  // TODO: Have some sort of hint or other heuristics to guess occupancy based
+  // on other factors..
+  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
+  if (OccupancyHint == 0)
+    OccupancyHint = 7;
+
+  // Clamp to max value.
+  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
+
+  // Check the hint but ignore it if it's obviously wrong from the existing LDS
+  // usage.
+  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
+
+
+  // Round up to the next tier of usage.
+  unsigned MaxSizeWithWaveCount
+    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
+
+  // Program is possibly broken by using more local mem than available.
+  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
+    return false;
+
+  LocalMemLimit = MaxSizeWithWaveCount;
+
+  DEBUG(
+    dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
+    << "  Rounding size to " << MaxSizeWithWaveCount
+    << " with a maximum occupancy of " << MaxOccupancy << '\n'
+    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
+    << " available for promotion\n"
+  );
+
+  return true;
+}
+
 // FIXME: Should try to pick the most likely to be profitable allocas first.
-void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
+bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
   // Array allocations are probably not worth handling, since an allocation of
   // the array type is the canonical form.
   if (!I.isStaticAlloca() || I.isArrayAllocation())
-    return;
+    return false;
 
   IRBuilder<> Builder(&I);
 
@@ -675,10 +687,8 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
 
   DEBUG(dbgs() << "Trying to promote " << I << '\n');
 
-  if (tryPromoteAllocaToVector(&I, AS)) {
-    DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
-    return;
-  }
+  if (tryPromoteAllocaToVector(&I, AS))
+    return true; // Promoted to vector.
 
   const Function &ContainingFunction = *I.getParent()->getParent();
   CallingConv::ID CC = ContainingFunction.getCallingConv();
@@ -692,9 +702,13 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
     break;
   default:
     DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n");
-    return;
+    return false;
   }
 
+  // Not likely to have sufficient local memory for promotion.
+  if (!SufficientLDS)
+    return false;
+
   const AMDGPUSubtarget &ST =
     TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
   unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
@@ -718,7 +732,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
   if (NewSize > LocalMemLimit) {
     DEBUG(dbgs() << "  " << AllocSize
           << " bytes of local memory not available to promote\n");
-    return;
+    return false;
   }
 
   CurrentLocalMemUsage = NewSize;
@@ -727,7 +741,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
 
   if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
     DEBUG(dbgs() << " Do not know how to convert all uses\n");
-    return;
+    return false;
   }
 
   DEBUG(dbgs() << "Promoting alloca to local memory\n");
@@ -873,6 +887,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
       llvm_unreachable("Don't know how to promote alloca intrinsic use.");
     }
   }
+  return true;
 }
 
 FunctionPass *llvm::createAMDGPUPromoteAlloca() {
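
Note (not part of the commit): the sketch below is a simplified, standalone illustration of the
control flow this patch produces. The types, member names, and the 32 KiB figure are stand-ins,
not the real LLVM classes or target limits; it only shows how the whole-function LDS budget check
(hasSufficientLocalMem) is hoisted out of the per-alloca handler, how vector promotion is attempted
before that budget is consulted, and how the per-alloca results are folded into the pass's return value.

// Simplified, hypothetical sketch (not the real LLVM API): mirrors the
// refactored driver logic of AMDGPUPromoteAlloca after this patch.
#include <vector>

struct Alloca {
  bool IsStaticAlloca;   // stand-in for AllocaInst::isStaticAlloca()
  bool Vectorizable;     // stand-in for tryPromoteAllocaToVector() succeeding
  unsigned SizeInBytes;  // stand-in for the alloca's LDS footprint
};

struct Function {
  std::vector<Alloca> EntryAllocas; // allocas in the entry block
};

struct PromoteAllocaSketch {
  unsigned LocalMemLimit = 0;        // LDS budget picked once per function
  unsigned CurrentLocalMemUsage = 0; // LDS already accounted for

  // Whole-function check, now factored out of runOnFunction. The real pass
  // inspects F's pointer parameters and the module's LDS globals here; the
  // 32 KiB budget below is an assumption for illustration only.
  bool hasSufficientLocalMem(const Function &F) {
    (void)F;
    LocalMemLimit = 32768;
    return CurrentLocalMemUsage < LocalMemLimit;
  }

  // Per-alloca handler: returns true iff it changed anything.
  bool handleAlloca(Alloca &A, bool SufficientLDS) {
    if (!A.IsStaticAlloca)
      return false;
    if (A.Vectorizable)
      return true;                 // promoted to vector; needs no LDS
    if (!SufficientLDS)
      return false;                // LDS promotion disabled for this function
    if (CurrentLocalMemUsage + A.SizeInBytes > LocalMemLimit)
      return false;                // this alloca does not fit in the budget
    CurrentLocalMemUsage += A.SizeInBytes;
    return true;                   // promoted to LDS
  }

  bool runOnFunction(Function &F) {
    bool SufficientLDS = hasSufficientLocalMem(F); // computed exactly once
    bool Changed = false;
    for (Alloca &A : F.EntryAllocas)
      Changed |= handleAlloca(A, SufficientLDS);
    return Changed;                // report whether anything was modified
  }
};

The two behavioral points this encodes match the diff: tryPromoteAllocaToVector runs before the
SufficientLDS check, so allocas can still be vectorized when the LDS budget is exhausted, and
runOnFunction now reports whether the function actually changed instead of always returning true.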