diff options
Diffstat (limited to 'llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp')
-rw-r--r-- | llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp | 209 |
1 files changed, 146 insertions, 63 deletions
diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index ab65f56d088f..687e14d6d7d2 100644 --- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -21,7 +21,6 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CFG.h" @@ -32,6 +31,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include "llvm/Transforms/Utils/ValueMapper.h" using namespace llvm; @@ -61,10 +61,10 @@ namespace { /// Loop prefetch implementation class. class LoopDataPrefetch { public: - LoopDataPrefetch(AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, - const TargetTransformInfo *TTI, + LoopDataPrefetch(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI, + ScalarEvolution *SE, const TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE) - : AC(AC), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {} + : AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {} bool run(); @@ -73,12 +73,16 @@ private: /// Check if the stride of the accesses is large enough to /// warrant a prefetch. - bool isStrideLargeEnough(const SCEVAddRecExpr *AR); + bool isStrideLargeEnough(const SCEVAddRecExpr *AR, unsigned TargetMinStride); - unsigned getMinPrefetchStride() { + unsigned getMinPrefetchStride(unsigned NumMemAccesses, + unsigned NumStridedMemAccesses, + unsigned NumPrefetches, + bool HasCall) { if (MinPrefetchStride.getNumOccurrences() > 0) return MinPrefetchStride; - return TTI->getMinPrefetchStride(); + return TTI->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, + NumPrefetches, HasCall); } unsigned getPrefetchDistance() { @@ -93,7 +97,14 @@ private: return TTI->getMaxPrefetchIterationsAhead(); } + bool doPrefetchWrites() { + if (PrefetchWrites.getNumOccurrences() > 0) + return PrefetchWrites; + return TTI->enableWritePrefetching(); + } + AssumptionCache *AC; + DominatorTree *DT; LoopInfo *LI; ScalarEvolution *SE; const TargetTransformInfo *TTI; @@ -110,6 +121,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); @@ -138,8 +150,8 @@ FunctionPass *llvm::createLoopDataPrefetchPass() { return new LoopDataPrefetchLegacyPass(); } -bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR) { - unsigned TargetMinStride = getMinPrefetchStride(); +bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR, + unsigned TargetMinStride) { // No need to check if any stride goes. if (TargetMinStride <= 1) return true; @@ -156,6 +168,7 @@ bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR) { PreservedAnalyses LoopDataPrefetchPass::run(Function &F, FunctionAnalysisManager &AM) { + DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F); LoopInfo *LI = &AM.getResult<LoopAnalysis>(F); ScalarEvolution *SE = &AM.getResult<ScalarEvolutionAnalysis>(F); AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F); @@ -163,7 +176,7 @@ PreservedAnalyses LoopDataPrefetchPass::run(Function &F, &AM.getResult<OptimizationRemarkEmitterAnalysis>(F); const TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F); - LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE); + LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE); bool Changed = LDP.run(); if (Changed) { @@ -180,6 +193,7 @@ bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) { if (skipFunction(F)) return false; + DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); AssumptionCache *AC = @@ -189,7 +203,7 @@ bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) { const TargetTransformInfo *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE); + LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE); return LDP.run(); } @@ -210,6 +224,49 @@ bool LoopDataPrefetch::run() { return MadeChange; } +/// A record for a potential prefetch made during the initial scan of the +/// loop. This is used to let a single prefetch target multiple memory accesses. +struct Prefetch { + /// The address formula for this prefetch as returned by ScalarEvolution. + const SCEVAddRecExpr *LSCEVAddRec; + /// The point of insertion for the prefetch instruction. + Instruction *InsertPt; + /// True if targeting a write memory access. + bool Writes; + /// The (first seen) prefetched instruction. + Instruction *MemI; + + /// Constructor to create a new Prefetch for \p I. + Prefetch(const SCEVAddRecExpr *L, Instruction *I) + : LSCEVAddRec(L), InsertPt(nullptr), Writes(false), MemI(nullptr) { + addInstruction(I); + }; + + /// Add the instruction \param I to this prefetch. If it's not the first + /// one, 'InsertPt' and 'Writes' will be updated as required. + /// \param PtrDiff the known constant address difference to the first added + /// instruction. + void addInstruction(Instruction *I, DominatorTree *DT = nullptr, + int64_t PtrDiff = 0) { + if (!InsertPt) { + MemI = I; + InsertPt = I; + Writes = isa<StoreInst>(I); + } else { + BasicBlock *PrefBB = InsertPt->getParent(); + BasicBlock *InsBB = I->getParent(); + if (PrefBB != InsBB) { + BasicBlock *DomBB = DT->findNearestCommonDominator(PrefBB, InsBB); + if (DomBB != PrefBB) + InsertPt = DomBB->getTerminator(); + } + + if (isa<StoreInst>(I) && PtrDiff == 0) + Writes = true; + } + } +}; + bool LoopDataPrefetch::runOnLoop(Loop *L) { bool MadeChange = false; @@ -222,15 +279,22 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { // Calculate the number of iterations ahead to prefetch CodeMetrics Metrics; + bool HasCall = false; for (const auto BB : L->blocks()) { // If the loop already has prefetches, then assume that the user knows // what they are doing and don't add any more. - for (auto &I : *BB) - if (CallInst *CI = dyn_cast<CallInst>(&I)) - if (Function *F = CI->getCalledFunction()) + for (auto &I : *BB) { + if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) { + if (const Function *F = cast<CallBase>(I).getCalledFunction()) { if (F->getIntrinsicID() == Intrinsic::prefetch) return MadeChange; - + if (TTI->isLoweredToCall(F)) + HasCall = true; + } else { // indirect call. + HasCall = true; + } + } + } Metrics.analyzeBasicBlock(BB, *TTI, EphValues); } unsigned LoopSize = Metrics.NumInsts; @@ -244,12 +308,14 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { if (ItersAhead > getMaxPrefetchIterationsAhead()) return MadeChange; - LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead - << " iterations ahead (loop size: " << LoopSize << ") in " - << L->getHeader()->getParent()->getName() << ": " << *L); + unsigned ConstantMaxTripCount = SE->getSmallConstantMaxTripCount(L); + if (ConstantMaxTripCount && ConstantMaxTripCount < ItersAhead + 1) + return MadeChange; - SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>, 16> PrefLoads; - for (const auto BB : L->blocks()) { + unsigned NumMemAccesses = 0; + unsigned NumStridedMemAccesses = 0; + SmallVector<Prefetch, 16> Prefetches; + for (const auto BB : L->blocks()) for (auto &I : *BB) { Value *PtrValue; Instruction *MemI; @@ -258,7 +324,7 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { MemI = LMemI; PtrValue = LMemI->getPointerOperand(); } else if (StoreInst *SMemI = dyn_cast<StoreInst>(&I)) { - if (!PrefetchWrites) continue; + if (!doPrefetchWrites()) continue; MemI = SMemI; PtrValue = SMemI->getPointerOperand(); } else continue; @@ -266,7 +332,7 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace(); if (PtrAddrSpace) continue; - + NumMemAccesses++; if (L->isLoopInvariant(PtrValue)) continue; @@ -274,62 +340,79 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); if (!LSCEVAddRec) continue; + NumStridedMemAccesses++; - // Check if the stride of the accesses is large enough to warrant a - // prefetch. - if (!isStrideLargeEnough(LSCEVAddRec)) - continue; - - // We don't want to double prefetch individual cache lines. If this load - // is known to be within one cache line of some other load that has - // already been prefetched, then don't prefetch this one as well. + // We don't want to double prefetch individual cache lines. If this + // access is known to be within one cache line of some other one that + // has already been prefetched, then don't prefetch this one as well. bool DupPref = false; - for (const auto &PrefLoad : PrefLoads) { - const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, PrefLoad.second); + for (auto &Pref : Prefetches) { + const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, Pref.LSCEVAddRec); if (const SCEVConstant *ConstPtrDiff = dyn_cast<SCEVConstant>(PtrDiff)) { int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue()); if (PD < (int64_t) TTI->getCacheLineSize()) { + Pref.addInstruction(MemI, DT, PD); DupPref = true; break; } } } - if (DupPref) - continue; + if (!DupPref) + Prefetches.push_back(Prefetch(LSCEVAddRec, MemI)); + } - const SCEV *NextLSCEV = SE->getAddExpr(LSCEVAddRec, SE->getMulExpr( - SE->getConstant(LSCEVAddRec->getType(), ItersAhead), - LSCEVAddRec->getStepRecurrence(*SE))); - if (!isSafeToExpand(NextLSCEV, *SE)) - continue; + unsigned TargetMinStride = + getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, + Prefetches.size(), HasCall); - PrefLoads.push_back(std::make_pair(MemI, LSCEVAddRec)); - - Type *I8Ptr = Type::getInt8PtrTy(BB->getContext(), PtrAddrSpace); - SCEVExpander SCEVE(*SE, I.getModule()->getDataLayout(), "prefaddr"); - Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, MemI); - - IRBuilder<> Builder(MemI); - Module *M = BB->getParent()->getParent(); - Type *I32 = Type::getInt32Ty(BB->getContext()); - Function *PrefetchFunc = Intrinsic::getDeclaration( - M, Intrinsic::prefetch, PrefPtrValue->getType()); - Builder.CreateCall( - PrefetchFunc, - {PrefPtrValue, - ConstantInt::get(I32, MemI->mayReadFromMemory() ? 0 : 1), - ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)}); - ++NumPrefetches; - LLVM_DEBUG(dbgs() << " Access: " << *PtrValue << ", SCEV: " << *LSCEV - << "\n"); - ORE->emit([&]() { - return OptimizationRemark(DEBUG_TYPE, "Prefetched", MemI) - << "prefetched memory access"; + LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead + << " iterations ahead (loop size: " << LoopSize << ") in " + << L->getHeader()->getParent()->getName() << ": " << *L); + LLVM_DEBUG(dbgs() << "Loop has: " + << NumMemAccesses << " memory accesses, " + << NumStridedMemAccesses << " strided memory accesses, " + << Prefetches.size() << " potential prefetch(es), " + << "a minimum stride of " << TargetMinStride << ", " + << (HasCall ? "calls" : "no calls") << ".\n"); + + for (auto &P : Prefetches) { + // Check if the stride of the accesses is large enough to warrant a + // prefetch. + if (!isStrideLargeEnough(P.LSCEVAddRec, TargetMinStride)) + continue; + + const SCEV *NextLSCEV = SE->getAddExpr(P.LSCEVAddRec, SE->getMulExpr( + SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead), + P.LSCEVAddRec->getStepRecurrence(*SE))); + if (!isSafeToExpand(NextLSCEV, *SE)) + continue; + + BasicBlock *BB = P.InsertPt->getParent(); + Type *I8Ptr = Type::getInt8PtrTy(BB->getContext(), 0/*PtrAddrSpace*/); + SCEVExpander SCEVE(*SE, BB->getModule()->getDataLayout(), "prefaddr"); + Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, P.InsertPt); + + IRBuilder<> Builder(P.InsertPt); + Module *M = BB->getParent()->getParent(); + Type *I32 = Type::getInt32Ty(BB->getContext()); + Function *PrefetchFunc = Intrinsic::getDeclaration( + M, Intrinsic::prefetch, PrefPtrValue->getType()); + Builder.CreateCall( + PrefetchFunc, + {PrefPtrValue, + ConstantInt::get(I32, P.Writes), + ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)}); + ++NumPrefetches; + LLVM_DEBUG(dbgs() << " Access: " + << *P.MemI->getOperand(isa<LoadInst>(P.MemI) ? 0 : 1) + << ", SCEV: " << *P.LSCEVAddRec << "\n"); + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "Prefetched", P.MemI) + << "prefetched memory access"; }); - MadeChange = true; - } + MadeChange = true; } return MadeChange; |