Diffstat (limited to 'llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp')
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp  209
1 file changed, 146 insertions(+), 63 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
index ab65f56d088f..687e14d6d7d2 100644
--- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -21,7 +21,6 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CFG.h"
@@ -32,6 +31,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
using namespace llvm;
@@ -61,10 +61,10 @@ namespace {
/// Loop prefetch implementation class.
class LoopDataPrefetch {
public:
- LoopDataPrefetch(AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE,
- const TargetTransformInfo *TTI,
+ LoopDataPrefetch(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI,
+ ScalarEvolution *SE, const TargetTransformInfo *TTI,
OptimizationRemarkEmitter *ORE)
- : AC(AC), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}
+ : AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}
bool run();
@@ -73,12 +73,16 @@ private:
/// Check if the stride of the accesses is large enough to
/// warrant a prefetch.
- bool isStrideLargeEnough(const SCEVAddRecExpr *AR);
+ bool isStrideLargeEnough(const SCEVAddRecExpr *AR, unsigned TargetMinStride);
- unsigned getMinPrefetchStride() {
+ unsigned getMinPrefetchStride(unsigned NumMemAccesses,
+ unsigned NumStridedMemAccesses,
+ unsigned NumPrefetches,
+ bool HasCall) {
if (MinPrefetchStride.getNumOccurrences() > 0)
return MinPrefetchStride;
- return TTI->getMinPrefetchStride();
+ return TTI->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
+ NumPrefetches, HasCall);
}
unsigned getPrefetchDistance() {
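Note: the hook now receives a profile of the loop (total memory accesses, strided accesses, planned prefetches, and whether the loop contains calls) so a target can adapt its minimum-stride answer. A minimal sketch of how a backend might use this; the function name and every threshold below are illustrative assumptions, not taken from any in-tree target:

    #include <climits>

    // Hypothetical target heuristic; name and thresholds are assumptions.
    static unsigned exampleMinPrefetchStride(unsigned NumMemAccesses,
                                             unsigned NumStridedMemAccesses,
                                             unsigned NumPrefetches,
                                             bool HasCall) {
      // Many distinct prefetch streams: UINT_MAX makes every stride fail
      // the isStrideLargeEnough() test, effectively disabling prefetching.
      if (NumPrefetches > 16)
        return UINT_MAX;
      // Densely strided loop with no calls: accept any stride.
      if (!HasCall && NumStridedMemAccesses * 4 >= NumMemAccesses * 3)
        return 1;
      // Otherwise require a large stride before a prefetch pays off.
      return 2048;
    }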
@@ -93,7 +97,14 @@ private:
return TTI->getMaxPrefetchIterationsAhead();
}
+ bool doPrefetchWrites() {
+ if (PrefetchWrites.getNumOccurrences() > 0)
+ return PrefetchWrites;
+ return TTI->enableWritePrefetching();
+ }
+
AssumptionCache *AC;
+ DominatorTree *DT;
LoopInfo *LI;
ScalarEvolution *SE;
const TargetTransformInfo *TTI;
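Note: doPrefetchWrites() follows the same pattern as the other knobs in this file: an explicit command-line setting, detected via getNumOccurrences(), overrides the TTI default. A sketch of the cl::opt it consults; the flag name and description are assumptions inferred from the surrounding code:

    #include "llvm/Support/CommandLine.h"
    using namespace llvm;

    // Assumed flag name/description; the override logic only needs
    // getNumOccurrences() to tell "set on the command line" from "default".
    static cl::opt<bool> PrefetchWrites(
        "loop-prefetch-writes", cl::Hidden, cl::init(false),
        cl::desc("Prefetch write addresses"));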
@@ -110,6 +121,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
@@ -138,8 +150,8 @@ FunctionPass *llvm::createLoopDataPrefetchPass() {
return new LoopDataPrefetchLegacyPass();
}
-bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR) {
- unsigned TargetMinStride = getMinPrefetchStride();
+bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR,
+ unsigned TargetMinStride) {
// If any stride is acceptable, there is nothing to check.
if (TargetMinStride <= 1)
return true;
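Note: the remainder of isStrideLargeEnough() (unchanged, so not shown in this diff) compares the absolute constant step of the add-recurrence against TargetMinStride. A simplified model under that assumption:

    #include <cstdint>
    #include <cstdlib>

    // Simplified model: a non-constant step never qualifies, and a constant
    // step qualifies when |step| is at least the target's minimum stride.
    static bool strideLargeEnough(const int64_t *StepConst,
                                  unsigned TargetMinStride) {
      if (TargetMinStride <= 1)
        return true;              // any stride goes
      if (!StepConst)
        return false;             // stride unknown at compile time
      return std::abs(*StepConst) >= (int64_t)TargetMinStride;
    }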
@@ -156,6 +168,7 @@ bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR) {
PreservedAnalyses LoopDataPrefetchPass::run(Function &F,
FunctionAnalysisManager &AM) {
+ DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
ScalarEvolution *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
@@ -163,7 +176,7 @@ PreservedAnalyses LoopDataPrefetchPass::run(Function &F,
&AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
const TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);
- LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE);
+ LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
bool Changed = LDP.run();
if (Changed) {
@@ -180,6 +193,7 @@ bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
AssumptionCache *AC =
@@ -189,7 +203,7 @@ bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) {
const TargetTransformInfo *TTI =
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE);
+ LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
return LDP.run();
}
@@ -210,6 +224,49 @@ bool LoopDataPrefetch::run() {
return MadeChange;
}
+/// A record for a potential prefetch made during the initial scan of the
+/// loop. This is used to let a single prefetch target multiple memory accesses.
+struct Prefetch {
+ /// The address formula for this prefetch as returned by ScalarEvolution.
+ const SCEVAddRecExpr *LSCEVAddRec;
+ /// The point of insertion for the prefetch instruction.
+ Instruction *InsertPt;
+ /// True if targeting a write memory access.
+ bool Writes;
+ /// The (first seen) prefetched instruction.
+ Instruction *MemI;
+
+ /// Constructor to create a new Prefetch for \p I.
+ Prefetch(const SCEVAddRecExpr *L, Instruction *I)
+ : LSCEVAddRec(L), InsertPt(nullptr), Writes(false), MemI(nullptr) {
+ addInstruction(I);
+ }
+
+ /// Add the instruction \p I to this prefetch. If it is not the first
+ /// one, 'InsertPt' and 'Writes' will be updated as required.
+ /// \param PtrDiff the known constant address difference to the first added
+ /// instruction.
+ void addInstruction(Instruction *I, DominatorTree *DT = nullptr,
+ int64_t PtrDiff = 0) {
+ if (!InsertPt) {
+ MemI = I;
+ InsertPt = I;
+ Writes = isa<StoreInst>(I);
+ } else {
+ BasicBlock *PrefBB = InsertPt->getParent();
+ BasicBlock *InsBB = I->getParent();
+ if (PrefBB != InsBB) {
+ BasicBlock *DomBB = DT->findNearestCommonDominator(PrefBB, InsBB);
+ if (DomBB != PrefBB)
+ InsertPt = DomBB->getTerminator();
+ }
+
+ if (isa<StoreInst>(I) && PtrDiff == 0)
+ Writes = true;
+ }
+ }
+};
+
bool LoopDataPrefetch::runOnLoop(Loop *L) {
bool MadeChange = false;
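Note: two details of addInstruction() are easy to miss. The insertion point is hoisted to the nearest common dominator when a later access lives in a different block, and a later store only flips Writes when it hits exactly the same address as the first access (PtrDiff == 0). A self-contained model of the Writes rule:

    #include <cassert>
    #include <cstdint>

    // Model of the Writes update in Prefetch::addInstruction().
    struct PrefetchModel {
      bool Writes = false;
      bool First = true;
      void add(bool IsStore, int64_t PtrDiff) {
        if (First) {
          Writes = IsStore;       // first access decides the initial kind
          First = false;
        } else if (IsStore && PtrDiff == 0) {
          Writes = true;          // later store to the exact same address
        }
      }
    };

    int main() {
      PrefetchModel P;
      P.add(/*IsStore=*/false, 0);  // first access is a load
      P.add(/*IsStore=*/true, 64);  // store nearby, not the same address
      assert(!P.Writes);            // still a read prefetch
      P.add(/*IsStore=*/true, 0);   // store to the same address
      assert(P.Writes);             // now a write prefetch
      return 0;
    }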
@@ -222,15 +279,22 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
// Calculate the number of iterations ahead to prefetch
CodeMetrics Metrics;
+ bool HasCall = false;
for (const auto BB : L->blocks()) {
// If the loop already has prefetches, then assume that the user knows
// what they are doing and don't add any more.
- for (auto &I : *BB)
- if (CallInst *CI = dyn_cast<CallInst>(&I))
- if (Function *F = CI->getCalledFunction())
+ for (auto &I : *BB) {
+ if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
+ if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
if (F->getIntrinsicID() == Intrinsic::prefetch)
return MadeChange;
-
+ if (TTI->isLoweredToCall(F))
+ HasCall = true;
+ } else { // indirect call.
+ HasCall = true;
+ }
+ }
+ }
Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
}
unsigned LoopSize = Metrics.NumInsts;
@@ -244,12 +308,14 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
if (ItersAhead > getMaxPrefetchIterationsAhead())
return MadeChange;
- LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead
- << " iterations ahead (loop size: " << LoopSize << ") in "
- << L->getHeader()->getParent()->getName() << ": " << *L);
+ unsigned ConstantMaxTripCount = SE->getSmallConstantMaxTripCount(L);
+ if (ConstantMaxTripCount && ConstantMaxTripCount < ItersAhead + 1)
+ return MadeChange;
- SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>, 16> PrefLoads;
- for (const auto BB : L->blocks()) {
+ unsigned NumMemAccesses = 0;
+ unsigned NumStridedMemAccesses = 0;
+ SmallVector<Prefetch, 16> Prefetches;
+ for (const auto BB : L->blocks())
for (auto &I : *BB) {
Value *PtrValue;
Instruction *MemI;
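Note: the new ConstantMaxTripCount check in the hunk above bails out of loops that cannot run long enough to reach the prefetched data. A worked example with illustrative numbers; ItersAhead is computed earlier in runOnLoop() (unchanged code, not in this diff) as the prefetch distance divided by the loop size:

    #include <cassert>

    int main() {
      // Illustrative values only.
      unsigned PrefetchDistance = 300;   // target prefetch distance (assumed)
      unsigned LoopSize = 20;            // instructions in the loop body
      unsigned ItersAhead = PrefetchDistance / LoopSize;  // 15
      unsigned ConstantMaxTripCount = 8;                  // SCEV-proven bound
      // A loop bounded to 8 iterations never reaches data prefetched 15
      // iterations ahead, so runOnLoop() returns without emitting prefetches.
      assert(ConstantMaxTripCount && ConstantMaxTripCount < ItersAhead + 1);
      return 0;
    }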
@@ -258,7 +324,7 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
MemI = LMemI;
PtrValue = LMemI->getPointerOperand();
} else if (StoreInst *SMemI = dyn_cast<StoreInst>(&I)) {
- if (!PrefetchWrites) continue;
+ if (!doPrefetchWrites()) continue;
MemI = SMemI;
PtrValue = SMemI->getPointerOperand();
} else continue;
@@ -266,7 +332,7 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
if (PtrAddrSpace)
continue;
-
+ NumMemAccesses++;
if (L->isLoopInvariant(PtrValue))
continue;
@@ -274,62 +340,79 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
if (!LSCEVAddRec)
continue;
+ NumStridedMemAccesses++;
- // Check if the stride of the accesses is large enough to warrant a
- // prefetch.
- if (!isStrideLargeEnough(LSCEVAddRec))
- continue;
-
- // We don't want to double prefetch individual cache lines. If this load
- // is known to be within one cache line of some other load that has
- // already been prefetched, then don't prefetch this one as well.
+ // We don't want to double prefetch individual cache lines. If this
+ // access is known to be within one cache line of some other one that
+ // has already been prefetched, then don't prefetch this one as well.
bool DupPref = false;
- for (const auto &PrefLoad : PrefLoads) {
- const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, PrefLoad.second);
+ for (auto &Pref : Prefetches) {
+ const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, Pref.LSCEVAddRec);
if (const SCEVConstant *ConstPtrDiff =
dyn_cast<SCEVConstant>(PtrDiff)) {
int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue());
if (PD < (int64_t) TTI->getCacheLineSize()) {
+ Pref.addInstruction(MemI, DT, PD);
DupPref = true;
break;
}
}
}
- if (DupPref)
- continue;
+ if (!DupPref)
+ Prefetches.push_back(Prefetch(LSCEVAddRec, MemI));
+ }
- const SCEV *NextLSCEV = SE->getAddExpr(LSCEVAddRec, SE->getMulExpr(
- SE->getConstant(LSCEVAddRec->getType(), ItersAhead),
- LSCEVAddRec->getStepRecurrence(*SE)));
- if (!isSafeToExpand(NextLSCEV, *SE))
- continue;
+ unsigned TargetMinStride =
+ getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
+ Prefetches.size(), HasCall);
- PrefLoads.push_back(std::make_pair(MemI, LSCEVAddRec));
-
- Type *I8Ptr = Type::getInt8PtrTy(BB->getContext(), PtrAddrSpace);
- SCEVExpander SCEVE(*SE, I.getModule()->getDataLayout(), "prefaddr");
- Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, MemI);
-
- IRBuilder<> Builder(MemI);
- Module *M = BB->getParent()->getParent();
- Type *I32 = Type::getInt32Ty(BB->getContext());
- Function *PrefetchFunc = Intrinsic::getDeclaration(
- M, Intrinsic::prefetch, PrefPtrValue->getType());
- Builder.CreateCall(
- PrefetchFunc,
- {PrefPtrValue,
- ConstantInt::get(I32, MemI->mayReadFromMemory() ? 0 : 1),
- ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
- ++NumPrefetches;
- LLVM_DEBUG(dbgs() << " Access: " << *PtrValue << ", SCEV: " << *LSCEV
- << "\n");
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "Prefetched", MemI)
- << "prefetched memory access";
+ LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead
+ << " iterations ahead (loop size: " << LoopSize << ") in "
+ << L->getHeader()->getParent()->getName() << ": " << *L);
+ LLVM_DEBUG(dbgs() << "Loop has: "
+ << NumMemAccesses << " memory accesses, "
+ << NumStridedMemAccesses << " strided memory accesses, "
+ << Prefetches.size() << " potential prefetch(es), "
+ << "a minimum stride of " << TargetMinStride << ", "
+ << (HasCall ? "calls" : "no calls") << ".\n");
+
+ for (auto &P : Prefetches) {
+ // Check if the stride of the accesses is large enough to warrant a
+ // prefetch.
+ if (!isStrideLargeEnough(P.LSCEVAddRec, TargetMinStride))
+ continue;
+
+ const SCEV *NextLSCEV = SE->getAddExpr(P.LSCEVAddRec, SE->getMulExpr(
+ SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead),
+ P.LSCEVAddRec->getStepRecurrence(*SE)));
+ if (!isSafeToExpand(NextLSCEV, *SE))
+ continue;
+
+ BasicBlock *BB = P.InsertPt->getParent();
+ Type *I8Ptr = Type::getInt8PtrTy(BB->getContext(), 0/*PtrAddrSpace*/);
+ SCEVExpander SCEVE(*SE, BB->getModule()->getDataLayout(), "prefaddr");
+ Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, P.InsertPt);
+
+ IRBuilder<> Builder(P.InsertPt);
+ Module *M = BB->getParent()->getParent();
+ Type *I32 = Type::getInt32Ty(BB->getContext());
+ Function *PrefetchFunc = Intrinsic::getDeclaration(
+ M, Intrinsic::prefetch, PrefPtrValue->getType());
+ Builder.CreateCall(
+ PrefetchFunc,
+ {PrefPtrValue,
+ ConstantInt::get(I32, P.Writes),
+ ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
+ ++NumPrefetches;
+ LLVM_DEBUG(dbgs() << " Access: "
+ << *P.MemI->getOperand(isa<LoadInst>(P.MemI) ? 0 : 1)
+ << ", SCEV: " << *P.LSCEVAddRec << "\n");
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Prefetched", P.MemI)
+ << "prefetched memory access";
});
- MadeChange = true;
- }
+ MadeChange = true;
}
return MadeChange;
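Note: the address handed to SCEVExpander is the access's add-recurrence advanced by ItersAhead steps, and the emitted @llvm.prefetch call passes (address, rw = P.Writes, locality = 3, cache type = 1 for data). A numeric illustration of the address formula with assumed values:

    #include <cassert>
    #include <cstdint>

    int main() {
      // NextLSCEV = LSCEVAddRec + ItersAhead * Step; with assumed values:
      int64_t Step = 256;            // bytes the address advances per iteration
      int64_t ItersAhead = 4;        // iterations prefetched ahead
      int64_t CurrentAddr = 0x1000;  // address the access touches now
      int64_t PrefetchAddr = CurrentAddr + ItersAhead * Step;
      assert(PrefetchAddr == 0x1400);  // 1 KiB further along the stream
      return 0;
    }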