diff options
Diffstat (limited to 'llvm/lib/Transforms/Scalar/LoopInterchange.cpp')
| -rw-r--r-- | llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 196 |
1 files changed, 122 insertions, 74 deletions
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index c2b065c4eb31..1d3023d04463 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/LoopCacheAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopNestAnalysis.h" #include "llvm/Analysis/LoopPass.h" @@ -33,7 +34,6 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" @@ -44,7 +44,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include <cassert> @@ -120,8 +119,6 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, std::vector<char> Dep; Instruction *Src = cast<Instruction>(*I); Instruction *Dst = cast<Instruction>(*J); - if (Src == Dst) - continue; // Ignore Input dependencies. if (isa<LoadInst>(Src) && isa<LoadInst>(Dst)) continue; @@ -270,26 +267,28 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, return true; } -static LoopVector populateWorklist(Loop &L) { +static void populateWorklist(Loop &L, LoopVector &LoopList) { LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: " << L.getHeader()->getParent()->getName() << " Loop: %" << L.getHeader()->getName() << '\n'); - LoopVector LoopList; + assert(LoopList.empty() && "LoopList should initially be empty!"); Loop *CurrentLoop = &L; const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops(); while (!Vec->empty()) { // The current loop has multiple subloops in it hence it is not tightly // nested. // Discard all loops above it added into Worklist. - if (Vec->size() != 1) - return {}; + if (Vec->size() != 1) { + LoopList = {}; + return; + } LoopList.push_back(CurrentLoop); CurrentLoop = Vec->front(); Vec = &CurrentLoop->getSubLoops(); } LoopList.push_back(CurrentLoop); - return LoopList; + return; } namespace { @@ -360,8 +359,10 @@ public: : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} /// Check if the loop interchange is profitable. - bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, - CharMatrix &DepMatrix); + bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop, + unsigned InnerLoopId, unsigned OuterLoopId, + CharMatrix &DepMatrix, + const DenseMap<const Loop *, unsigned> &CostMap); private: int getInstrOrderCost(); @@ -412,23 +413,26 @@ struct LoopInterchange { LoopInfo *LI = nullptr; DependenceInfo *DI = nullptr; DominatorTree *DT = nullptr; + std::unique_ptr<CacheCost> CC = nullptr; /// Interface to emit optimization remarks. OptimizationRemarkEmitter *ORE; LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI, - DominatorTree *DT, OptimizationRemarkEmitter *ORE) - : SE(SE), LI(LI), DI(DI), DT(DT), ORE(ORE) {} + DominatorTree *DT, std::unique_ptr<CacheCost> &CC, + OptimizationRemarkEmitter *ORE) + : SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {} bool run(Loop *L) { if (L->getParentLoop()) return false; - - return processLoopList(populateWorklist(*L)); + SmallVector<Loop *, 8> LoopList; + populateWorklist(*L, LoopList); + return processLoopList(LoopList); } bool run(LoopNest &LN) { - const auto &LoopList = LN.getLoops(); + SmallVector<Loop *, 8> LoopList(LN.getLoops().begin(), LN.getLoops().end()); for (unsigned I = 1; I < LoopList.size(); ++I) if (LoopList[I]->getParentLoop() != LoopList[I - 1]) return false; @@ -460,7 +464,7 @@ struct LoopInterchange { return LoopList.size() - 1; } - bool processLoopList(ArrayRef<Loop *> LoopList) { + bool processLoopList(SmallVectorImpl<Loop *> &LoopList) { bool Changed = false; unsigned LoopNestDepth = LoopList.size(); if (LoopNestDepth < 2) { @@ -500,27 +504,55 @@ struct LoopInterchange { } unsigned SelecLoopId = selectLoopForInterchange(LoopList); - // Move the selected loop outwards to the best possible position. - Loop *LoopToBeInterchanged = LoopList[SelecLoopId]; - for (unsigned i = SelecLoopId; i > 0; i--) { - bool Interchanged = processLoop(LoopToBeInterchanged, LoopList[i - 1], i, - i - 1, DependencyMatrix); - if (!Interchanged) - return Changed; - // Update the DependencyMatrix - interChangeDependencies(DependencyMatrix, i, i - 1); + // Obtain the loop vector returned from loop cache analysis beforehand, + // and put each <Loop, index> pair into a map for constant time query + // later. Indices in loop vector reprsent the optimal order of the + // corresponding loop, e.g., given a loopnest with depth N, index 0 + // indicates the loop should be placed as the outermost loop and index N + // indicates the loop should be placed as the innermost loop. + // + // For the old pass manager CacheCost would be null. + DenseMap<const Loop *, unsigned> CostMap; + if (CC != nullptr) { + const auto &LoopCosts = CC->getLoopCosts(); + for (unsigned i = 0; i < LoopCosts.size(); i++) { + CostMap[LoopCosts[i].first] = i; + } + } + // We try to achieve the globally optimal memory access for the loopnest, + // and do interchange based on a bubble-sort fasion. We start from + // the innermost loop, move it outwards to the best possible position + // and repeat this process. + for (unsigned j = SelecLoopId; j > 0; j--) { + bool ChangedPerIter = false; + for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) { + bool Interchanged = processLoop(LoopList[i], LoopList[i - 1], i, i - 1, + DependencyMatrix, CostMap); + if (!Interchanged) + continue; + // Loops interchanged, update LoopList accordingly. + std::swap(LoopList[i - 1], LoopList[i]); + // Update the DependencyMatrix + interChangeDependencies(DependencyMatrix, i, i - 1); #ifdef DUMP_DEP_MATRICIES - LLVM_DEBUG(dbgs() << "Dependence after interchange\n"); - printDepMatrix(DependencyMatrix); + LLVM_DEBUG(dbgs() << "Dependence after interchange\n"); + printDepMatrix(DependencyMatrix); #endif - Changed |= Interchanged; + ChangedPerIter |= Interchanged; + Changed |= Interchanged; + } + // Early abort if there was no interchange during an entire round of + // moving loops outwards. + if (!ChangedPerIter) + break; } return Changed; } bool processLoop(Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId, unsigned OuterLoopId, - std::vector<std::vector<char>> &DependencyMatrix) { + std::vector<std::vector<char>> &DependencyMatrix, + const DenseMap<const Loop *, unsigned> &CostMap) { LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId << " and OuterLoopId = " << OuterLoopId << "\n"); LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE); @@ -530,7 +562,8 @@ struct LoopInterchange { } LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n"); LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); - if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) { + if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId, + DependencyMatrix, CostMap)) { LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n"); return false; } @@ -733,8 +766,12 @@ static PHINode *findInnerReductionPhi(Loop *L, Value *V) { if (PHI->getNumIncomingValues() == 1) continue; RecurrenceDescriptor RD; - if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) + if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) { + // Detect floating point reduction only when it can be reordered. + if (RD.getExactFPMathInst() != nullptr) + return nullptr; return PHI; + } return nullptr; } } @@ -893,28 +930,23 @@ areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL, static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock(); for (PHINode &PHI : LoopNestExit->phis()) { - // FIXME: We currently are not able to detect floating point reductions - // and have to use floating point PHIs as a proxy to prevent - // interchanging in the presence of floating point reductions. - if (PHI.getType()->isFloatingPointTy()) - return false; for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) { - Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i)); - if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch()) - continue; + Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i)); + if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch()) + continue; - // The incoming value is defined in the outer loop latch. Currently we - // only support that in case the outer loop latch has a single predecessor. - // This guarantees that the outer loop latch is executed if and only if - // the inner loop is executed (because tightlyNested() guarantees that the - // outer loop header only branches to the inner loop or the outer loop - // latch). - // FIXME: We could weaken this logic and allow multiple predecessors, - // if the values are produced outside the loop latch. We would need - // additional logic to update the PHI nodes in the exit block as - // well. - if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr) - return false; + // The incoming value is defined in the outer loop latch. Currently we + // only support that in case the outer loop latch has a single predecessor. + // This guarantees that the outer loop latch is executed if and only if + // the inner loop is executed (because tightlyNested() guarantees that the + // outer loop header only branches to the inner loop or the outer loop + // latch). + // FIXME: We could weaken this logic and allow multiple predecessors, + // if the values are produced outside the loop latch. We would need + // additional logic to update the PHI nodes in the exit block as + // well. + if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr) + return false; } } return true; @@ -1125,21 +1157,33 @@ static bool isProfitableForVectorization(unsigned InnerLoopId, return !DepMatrix.empty(); } -bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, - unsigned OuterLoopId, - CharMatrix &DepMatrix) { - // TODO: Add better profitability checks. - // e.g - // 1) Construct dependency matrix and move the one with no loop carried dep - // inside to enable vectorization. +bool LoopInterchangeProfitability::isProfitable( + const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, + unsigned OuterLoopId, CharMatrix &DepMatrix, + const DenseMap<const Loop *, unsigned> &CostMap) { + // TODO: Remove the legacy cost model. - // This is rough cost estimation algorithm. It counts the good and bad order - // of induction variables in the instruction and allows reordering if number - // of bad orders is more than good. - int Cost = getInstrOrderCost(); - LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n"); - if (Cost < -LoopInterchangeCostThreshold) - return true; + // This is the new cost model returned from loop cache analysis. + // A smaller index means the loop should be placed an outer loop, and vice + // versa. + if (CostMap.find(InnerLoop) != CostMap.end() && + CostMap.find(OuterLoop) != CostMap.end()) { + unsigned InnerIndex = 0, OuterIndex = 0; + InnerIndex = CostMap.find(InnerLoop)->second; + OuterIndex = CostMap.find(OuterLoop)->second; + LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex + << ", OuterIndex = " << OuterIndex << "\n"); + if (InnerIndex < OuterIndex) + return true; + } else { + // Legacy cost model: this is rough cost estimation algorithm. It counts the + // good and bad order of induction variables in the instruction and allows + // reordering if number of bad orders is more than good. + int Cost = getInstrOrderCost(); + LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n"); + if (Cost < -LoopInterchangeCostThreshold) + return true; + } // It is not profitable as per current cache profitability model. But check if // we can move this loop outside to improve parallelism. @@ -1150,10 +1194,8 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable", InnerLoop->getStartLoc(), InnerLoop->getHeader()) - << "Interchanging loops is too costly (cost=" - << ore::NV("Cost", Cost) << ", threshold=" - << ore::NV("Threshold", LoopInterchangeCostThreshold) - << ") and it does not improve parallelism."; + << "Interchanging loops is too costly and it does not improve " + "parallelism."; }); return false; } @@ -1424,9 +1466,13 @@ static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, // Incoming values are guaranteed be instructions currently. auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch)); + // In case of multi-level nested loops, follow LCSSA to find the incoming + // value defined from the innermost loop. + auto IncIInnerMost = cast<Instruction>(followLCSSA(IncI)); // Skip phis with incoming values from the inner loop body, excluding the // header and latch. - if (IncI->getParent() != InnerLatch && IncI->getParent() != InnerHeader) + if (IncIInnerMost->getParent() != InnerLatch && + IncIInnerMost->getParent() != InnerHeader) continue; assert(all_of(P.users(), @@ -1695,8 +1741,8 @@ struct LoopInterchangeLegacyPass : public LoopPass { auto *DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI(); auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); - - return LoopInterchange(SE, LI, DI, DT, ORE).run(L); + std::unique_ptr<CacheCost> CC = nullptr; + return LoopInterchange(SE, LI, DI, DT, CC, ORE).run(L); } }; } // namespace @@ -1723,8 +1769,10 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN, Function &F = *LN.getParent(); DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI); + std::unique_ptr<CacheCost> CC = + CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI); OptimizationRemarkEmitter ORE(&F); - if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &ORE).run(LN)) + if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); } |
