diff options
Diffstat (limited to 'llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp')
-rw-r--r-- | llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp | 533 |
1 files changed, 352 insertions, 181 deletions
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp index f1965934b2d7..dd628f3e7e0c 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp @@ -11,31 +11,54 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/DependenceAnalysis.h" -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" -#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/Utils/Local.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/IR/ValueMap.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/GenericDomTree.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" -#include "llvm/Transforms/Utils/SimplifyIndVar.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include <assert.h> +#include <memory> +#include <type_traits> +#include <vector> + using namespace llvm; #define DEBUG_TYPE "loop-unroll-and-jam" @@ -47,17 +70,14 @@ typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet; // Partition blocks in an outer/inner loop pair into blocks before and after // the loop -static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop, - BasicBlockSet &ForeBlocks, - BasicBlockSet &SubLoopBlocks, - BasicBlockSet &AftBlocks, - DominatorTree *DT) { +static bool partitionLoopBlocks(Loop &L, BasicBlockSet &ForeBlocks, + BasicBlockSet &AftBlocks, DominatorTree &DT) { + Loop *SubLoop = L.getSubLoops()[0]; BasicBlock *SubLoopLatch = SubLoop->getLoopLatch(); - SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end()); - for (BasicBlock *BB : L->blocks()) { + for (BasicBlock *BB : L.blocks()) { if (!SubLoop->contains(BB)) { - if (DT->dominates(SubLoopLatch, BB)) + if (DT.dominates(SubLoopLatch, BB)) AftBlocks.insert(BB); else ForeBlocks.insert(BB); @@ -71,14 +91,44 @@ static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop, if (BB == SubLoopPreHeader) continue; Instruction *TI = BB->getTerminator(); - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - if (!ForeBlocks.count(TI->getSuccessor(i))) + for (BasicBlock *Succ : successors(TI)) + if (!ForeBlocks.count(Succ)) return false; } return true; } +/// Partition blocks in a loop nest into blocks before and after each inner +/// loop. +static bool partitionOuterLoopBlocks( + Loop &Root, Loop &JamLoop, BasicBlockSet &JamLoopBlocks, + DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap, + DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, DominatorTree &DT) { + JamLoopBlocks.insert(JamLoop.block_begin(), JamLoop.block_end()); + + for (Loop *L : Root.getLoopsInPreorder()) { + if (L == &JamLoop) + break; + + if (!partitionLoopBlocks(*L, ForeBlocksMap[L], AftBlocksMap[L], DT)) + return false; + } + + return true; +} + +// TODO Remove when UnrollAndJamLoop changed to support unroll and jamming more +// than 2 levels loop. +static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop, + BasicBlockSet &ForeBlocks, + BasicBlockSet &SubLoopBlocks, + BasicBlockSet &AftBlocks, + DominatorTree *DT) { + SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end()); + return partitionLoopBlocks(*L, ForeBlocks, AftBlocks, *DT); +} + // Looks at the phi nodes in Header for values coming from Latch. For these // instructions and all their operands calls Visit on them, keeping going for // all the operands in AftBlocks. Returns false if Visit returns false, @@ -169,10 +219,12 @@ static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header, If EpilogueLoop is non-null, it receives the epilogue loop (if it was necessary to create one and not fully unrolled). */ -LoopUnrollResult llvm::UnrollAndJamLoop( - Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple, - bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, - AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) { +LoopUnrollResult +llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount, + unsigned TripMultiple, bool UnrollRemainder, + LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, + AssumptionCache *AC, const TargetTransformInfo *TTI, + OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) { // When we enter here we should have already checked that it is safe BasicBlock *Header = L->getHeader(); @@ -198,7 +250,7 @@ LoopUnrollResult llvm::UnrollAndJamLoop( if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false, /*UseEpilogRemainder*/ true, UnrollRemainder, /*ForgetAllSCEV*/ false, - LI, SE, DT, AC, true, EpilogueLoop)) { + LI, SE, DT, AC, TTI, true, EpilogueLoop)) { LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be " "generated when assuming runtime trip count\n"); return LoopUnrollResult::Unmodified; @@ -284,8 +336,7 @@ LoopUnrollResult llvm::UnrollAndJamLoop( // Move any instructions from fore phi operands from AftBlocks into Fore. moveHeaderPhiOperandsToForeBlocks( - Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(), - AftBlocks); + Header, LatchBlock, ForeBlocksLast[0]->getTerminator(), AftBlocks); // The current on-the-fly SSA update requires blocks to be processed in // reverse postorder so that LastValueMap contains the correct value at each @@ -312,32 +363,32 @@ LoopUnrollResult llvm::UnrollAndJamLoop( // Copy all blocks for (unsigned It = 1; It != Count; ++It) { - std::vector<BasicBlock *> NewBlocks; + SmallVector<BasicBlock *, 8> NewBlocks; // Maps Blocks[It] -> Blocks[It-1] DenseMap<Value *, Value *> PrevItValueMap; + SmallDenseMap<const Loop *, Loop *, 4> NewLoops; + NewLoops[L] = L; + NewLoops[SubLoop] = SubLoop; for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) { ValueToValueMapTy VMap; BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It)); Header->getParent()->getBasicBlockList().push_back(New); - if (ForeBlocks.count(*BB)) { - L->addBasicBlockToLoop(New, *LI); + // Tell LI about New. + addClonedBlockToLoopInfo(*BB, New, LI, NewLoops); + if (ForeBlocks.count(*BB)) { if (*BB == ForeBlocksFirst[0]) ForeBlocksFirst.push_back(New); if (*BB == ForeBlocksLast[0]) ForeBlocksLast.push_back(New); } else if (SubLoopBlocks.count(*BB)) { - SubLoop->addBasicBlockToLoop(New, *LI); - if (*BB == SubLoopBlocksFirst[0]) SubLoopBlocksFirst.push_back(New); if (*BB == SubLoopBlocksLast[0]) SubLoopBlocksLast.push_back(New); } else if (AftBlocks.count(*BB)) { - L->addBasicBlockToLoop(New, *LI); - if (*BB == AftBlocksFirst[0]) AftBlocksFirst.push_back(New); if (*BB == AftBlocksLast[0]) @@ -379,9 +430,9 @@ LoopUnrollResult llvm::UnrollAndJamLoop( } // Remap all instructions in the most recent iteration + remapInstructionsInBlocks(NewBlocks, LastValueMap); for (BasicBlock *NewBlock : NewBlocks) { for (Instruction &I : *NewBlock) { - ::remapInstruction(&I, LastValueMap); if (auto *II = dyn_cast<IntrinsicInst>(&I)) if (II->getIntrinsicID() == Intrinsic::assume) AC->registerAssumption(II); @@ -447,8 +498,8 @@ LoopUnrollResult llvm::UnrollAndJamLoop( // Update ForeBlocks successors and phi nodes BranchInst *ForeTerm = cast<BranchInst>(ForeBlocksLast.back()->getTerminator()); - BasicBlock *Dest = SubLoopBlocksFirst[0]; - ForeTerm->setSuccessor(0, Dest); + assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor"); + ForeTerm->setSuccessor(0, SubLoopBlocksFirst[0]); if (CompletelyUnroll) { while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) { @@ -465,8 +516,8 @@ LoopUnrollResult llvm::UnrollAndJamLoop( // Remap ForeBlock successors from previous iteration to this BranchInst *ForeTerm = cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator()); - BasicBlock *Dest = ForeBlocksFirst[It]; - ForeTerm->setSuccessor(0, Dest); + assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor"); + ForeTerm->setSuccessor(0, ForeBlocksFirst[It]); } // Subloop successors and phis @@ -495,12 +546,14 @@ LoopUnrollResult llvm::UnrollAndJamLoop( } // Aft blocks successors and phis - BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator()); + BranchInst *AftTerm = cast<BranchInst>(AftBlocksLast.back()->getTerminator()); if (CompletelyUnroll) { - BranchInst::Create(LoopExit, Term); - Term->eraseFromParent(); + BranchInst::Create(LoopExit, AftTerm); + AftTerm->eraseFromParent(); } else { - Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]); + AftTerm->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]); + assert(AftTerm->getSuccessor(ContinueOnTrue) == LoopExit && + "Expecting the ContinueOnTrue successor of AftTerm to be LoopExit"); } updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0], SubLoopBlocksLast.back()); @@ -540,55 +593,48 @@ LoopUnrollResult llvm::UnrollAndJamLoop( MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end()); MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end()); MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end()); - while (!MergeBlocks.empty()) { - BasicBlock *BB = *MergeBlocks.begin(); - BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator()); - if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) { - BasicBlock *Dest = Term->getSuccessor(0); - BasicBlock *Fold = Dest->getUniquePredecessor(); - if (MergeBlockIntoPredecessor(Dest, &DTU, LI)) { - // Don't remove BB and add Fold as they are the same BB - assert(Fold == BB); - (void)Fold; - MergeBlocks.erase(Dest); - } else - MergeBlocks.erase(BB); - } else - MergeBlocks.erase(BB); - } + + MergeBlockSuccessorsIntoGivenBlocks(MergeBlocks, L, &DTU, LI); + // Apply updates to the DomTree. DT = &DTU.getDomTree(); // At this point, the code is well formed. We now do a quick sweep over the // inserted code, doing constant propagation and dead code elimination as we // go. - simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC); - simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC); + simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC, TTI); + simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC, + TTI); NumCompletelyUnrolledAndJammed += CompletelyUnroll; ++NumUnrolledAndJammed; + // Update LoopInfo if the loop is completely removed. + if (CompletelyUnroll) + LI->erase(L); + #ifndef NDEBUG // We shouldn't have done anything to break loop simplify form or LCSSA. - Loop *OuterL = L->getParentLoop(); - Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop); + Loop *OutestLoop = SubLoop->getParentLoop() + ? SubLoop->getParentLoop()->getParentLoop() + ? SubLoop->getParentLoop()->getParentLoop() + : SubLoop->getParentLoop() + : SubLoop; + assert(DT->verify()); + LI->verify(*DT); assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI)); if (!CompletelyUnroll) assert(L->isLoopSimplifyForm()); assert(SubLoop->isLoopSimplifyForm()); - assert(DT->verify()); + SE->verify(); #endif - // Update LoopInfo if the loop is completely removed. - if (CompletelyUnroll) - LI->erase(L); - return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled : LoopUnrollResult::PartiallyUnrolled; } static bool getLoadsAndStores(BasicBlockSet &Blocks, - SmallVector<Value *, 4> &MemInstr) { + SmallVector<Instruction *, 4> &MemInstr) { // Scan the BBs and collect legal loads and stores. // Returns false if non-simple loads/stores are found. for (BasicBlock *BB : Blocks) { @@ -609,97 +655,235 @@ static bool getLoadsAndStores(BasicBlockSet &Blocks, return true; } -static bool checkDependencies(SmallVector<Value *, 4> &Earlier, - SmallVector<Value *, 4> &Later, - unsigned LoopDepth, bool InnerLoop, - DependenceInfo &DI) { - // Use DA to check for dependencies between loads and stores that make unroll - // and jam invalid - for (Value *I : Earlier) { - for (Value *J : Later) { - Instruction *Src = cast<Instruction>(I); - Instruction *Dst = cast<Instruction>(J); - if (Src == Dst) - continue; - // Ignore Input dependencies. - if (isa<LoadInst>(Src) && isa<LoadInst>(Dst)) - continue; - - // Track dependencies, and if we find them take a conservative approach - // by allowing only = or < (not >), altough some > would be safe - // (depending upon unroll width). - // For the inner loop, we need to disallow any (> <) dependencies - // FIXME: Allow > so long as distance is less than unroll width - if (auto D = DI.depends(Src, Dst, true)) { - assert(D->isOrdered() && "Expected an output, flow or anti dep."); - - if (D->isConfused()) { - LLVM_DEBUG(dbgs() << " Confused dependency between:\n" - << " " << *Src << "\n" - << " " << *Dst << "\n"); +static bool preservesForwardDependence(Instruction *Src, Instruction *Dst, + unsigned UnrollLevel, unsigned JamLevel, + bool Sequentialized, Dependence *D) { + // UnrollLevel might carry the dependency Src --> Dst + // Does a different loop after unrolling? + for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel; + ++CurLoopDepth) { + auto JammedDir = D->getDirection(CurLoopDepth); + if (JammedDir == Dependence::DVEntry::LT) + return true; + + if (JammedDir & Dependence::DVEntry::GT) + return false; + } + + return true; +} + +static bool preservesBackwardDependence(Instruction *Src, Instruction *Dst, + unsigned UnrollLevel, unsigned JamLevel, + bool Sequentialized, Dependence *D) { + // UnrollLevel might carry the dependency Dst --> Src + for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel; + ++CurLoopDepth) { + auto JammedDir = D->getDirection(CurLoopDepth); + if (JammedDir == Dependence::DVEntry::GT) + return true; + + if (JammedDir & Dependence::DVEntry::LT) + return false; + } + + // Backward dependencies are only preserved if not interleaved. + return Sequentialized; +} + +// Check whether it is semantically safe Src and Dst considering any potential +// dependency between them. +// +// @param UnrollLevel The level of the loop being unrolled +// @param JamLevel The level of the loop being jammed; if Src and Dst are on +// different levels, the outermost common loop counts as jammed level +// +// @return true if is safe and false if there is a dependency violation. +static bool checkDependency(Instruction *Src, Instruction *Dst, + unsigned UnrollLevel, unsigned JamLevel, + bool Sequentialized, DependenceInfo &DI) { + assert(UnrollLevel <= JamLevel && + "Expecting JamLevel to be at least UnrollLevel"); + + if (Src == Dst) + return true; + // Ignore Input dependencies. + if (isa<LoadInst>(Src) && isa<LoadInst>(Dst)) + return true; + + // Check whether unroll-and-jam may violate a dependency. + // By construction, every dependency will be lexicographically non-negative + // (if it was, it would violate the current execution order), such as + // (0,0,>,*,*) + // Unroll-and-jam changes the GT execution of two executions to the same + // iteration of the chosen unroll level. That is, a GT dependence becomes a GE + // dependence (or EQ, if we fully unrolled the loop) at the loop's position: + // (0,0,>=,*,*) + // Now, the dependency is not necessarily non-negative anymore, i.e. + // unroll-and-jam may violate correctness. + std::unique_ptr<Dependence> D = DI.depends(Src, Dst, true); + if (!D) + return true; + assert(D->isOrdered() && "Expected an output, flow or anti dep."); + + if (D->isConfused()) { + LLVM_DEBUG(dbgs() << " Confused dependency between:\n" + << " " << *Src << "\n" + << " " << *Dst << "\n"); + return false; + } + + // If outer levels (levels enclosing the loop being unroll-and-jammed) have a + // non-equal direction, then the locations accessed in the inner levels cannot + // overlap in memory. We assumes the indexes never overlap into neighboring + // dimensions. + for (unsigned CurLoopDepth = 1; CurLoopDepth < UnrollLevel; ++CurLoopDepth) + if (!(D->getDirection(CurLoopDepth) & Dependence::DVEntry::EQ)) + return true; + + auto UnrollDirection = D->getDirection(UnrollLevel); + + // If the distance carried by the unrolled loop is 0, then after unrolling + // that distance will become non-zero resulting in non-overlapping accesses in + // the inner loops. + if (UnrollDirection == Dependence::DVEntry::EQ) + return true; + + if (UnrollDirection & Dependence::DVEntry::LT && + !preservesForwardDependence(Src, Dst, UnrollLevel, JamLevel, + Sequentialized, D.get())) + return false; + + if (UnrollDirection & Dependence::DVEntry::GT && + !preservesBackwardDependence(Src, Dst, UnrollLevel, JamLevel, + Sequentialized, D.get())) + return false; + + return true; +} + +static bool +checkDependencies(Loop &Root, const BasicBlockSet &SubLoopBlocks, + const DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap, + const DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, + DependenceInfo &DI, LoopInfo &LI) { + SmallVector<BasicBlockSet, 8> AllBlocks; + for (Loop *L : Root.getLoopsInPreorder()) + if (ForeBlocksMap.find(L) != ForeBlocksMap.end()) + AllBlocks.push_back(ForeBlocksMap.lookup(L)); + AllBlocks.push_back(SubLoopBlocks); + for (Loop *L : Root.getLoopsInPreorder()) + if (AftBlocksMap.find(L) != AftBlocksMap.end()) + AllBlocks.push_back(AftBlocksMap.lookup(L)); + + unsigned LoopDepth = Root.getLoopDepth(); + SmallVector<Instruction *, 4> EarlierLoadsAndStores; + SmallVector<Instruction *, 4> CurrentLoadsAndStores; + for (BasicBlockSet &Blocks : AllBlocks) { + CurrentLoadsAndStores.clear(); + if (!getLoadsAndStores(Blocks, CurrentLoadsAndStores)) + return false; + + Loop *CurLoop = LI.getLoopFor((*Blocks.begin())->front().getParent()); + unsigned CurLoopDepth = CurLoop->getLoopDepth(); + + for (auto *Earlier : EarlierLoadsAndStores) { + Loop *EarlierLoop = LI.getLoopFor(Earlier->getParent()); + unsigned EarlierDepth = EarlierLoop->getLoopDepth(); + unsigned CommonLoopDepth = std::min(EarlierDepth, CurLoopDepth); + for (auto *Later : CurrentLoadsAndStores) { + if (!checkDependency(Earlier, Later, LoopDepth, CommonLoopDepth, false, + DI)) return false; - } - if (!InnerLoop) { - if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) { - LLVM_DEBUG(dbgs() << " > dependency between:\n" - << " " << *Src << "\n" - << " " << *Dst << "\n"); - return false; - } - } else { - assert(LoopDepth + 1 <= D->getLevels()); - if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT && - D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT) { - LLVM_DEBUG(dbgs() << " < > dependency between:\n" - << " " << *Src << "\n" - << " " << *Dst << "\n"); - return false; - } - } } } + + size_t NumInsts = CurrentLoadsAndStores.size(); + for (size_t I = 0; I < NumInsts; ++I) { + for (size_t J = I; J < NumInsts; ++J) { + if (!checkDependency(CurrentLoadsAndStores[I], CurrentLoadsAndStores[J], + LoopDepth, CurLoopDepth, true, DI)) + return false; + } + } + + EarlierLoadsAndStores.append(CurrentLoadsAndStores.begin(), + CurrentLoadsAndStores.end()); } return true; } -static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks, - BasicBlockSet &SubLoopBlocks, - BasicBlockSet &AftBlocks, DependenceInfo &DI) { - // Get all loads/store pairs for each blocks - SmallVector<Value *, 4> ForeMemInstr; - SmallVector<Value *, 4> SubLoopMemInstr; - SmallVector<Value *, 4> AftMemInstr; - if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) || - !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) || - !getLoadsAndStores(AftBlocks, AftMemInstr)) +static bool isEligibleLoopForm(const Loop &Root) { + // Root must have a child. + if (Root.getSubLoops().size() != 1) return false; - // Check for dependencies between any blocks that may change order - unsigned LoopDepth = L->getLoopDepth(); - return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false, - DI) && - checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) && - checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false, - DI) && - checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true, - DI); + const Loop *L = &Root; + do { + // All loops in Root need to be in simplify and rotated form. + if (!L->isLoopSimplifyForm()) + return false; + + if (!L->isRotatedForm()) + return false; + + if (L->getHeader()->hasAddressTaken()) { + LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n"); + return false; + } + + unsigned SubLoopsSize = L->getSubLoops().size(); + if (SubLoopsSize == 0) + return true; + + // Only one child is allowed. + if (SubLoopsSize != 1) + return false; + + L = L->getSubLoops()[0]; + } while (L); + + return true; +} + +static Loop *getInnerMostLoop(Loop *L) { + while (!L->getSubLoops().empty()) + L = L->getSubLoops()[0]; + return L; } bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, - DependenceInfo &DI) { + DependenceInfo &DI, LoopInfo &LI) { + if (!isEligibleLoopForm(*L)) { + LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Ineligible loop form\n"); + return false; + } + /* We currently handle outer loops like this: | - ForeFirst <----\ } - Blocks | } ForeBlocks - ForeLast | } - | | - SubLoopFirst <\ | } - Blocks | | } SubLoopBlocks - SubLoopLast -/ | } - | | - AftFirst | } - Blocks | } AftBlocks - AftLast ------/ } + ForeFirst <------\ } + Blocks | } ForeBlocks of L + ForeLast | } + | | + ... | + | | + ForeFirst <----\ | } + Blocks | | } ForeBlocks of a inner loop of L + ForeLast | | } + | | | + JamLoopFirst <\ | | } + Blocks | | | } JamLoopBlocks of the innermost loop + JamLoopLast -/ | | } + | | | + AftFirst | | } + Blocks | | } AftBlocks of a inner loop of L + AftLast ------/ | } + | | + ... | + | | + AftFirst | } + Blocks | } AftBlocks of L + AftLast --------/ } | There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks @@ -709,14 +893,16 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, things further in the profitablility checks of the unroll and jam pass. Because of the way we rearrange basic blocks, we also require that - the Fore blocks on all unrolled iterations are safe to move before the - SubLoop blocks of all iterations. So we require that the phi node looping - operands of ForeHeader can be moved to at least the end of ForeEnd, so that - we can arrange cloned Fore Blocks before the subloop and match up Phi's - correctly. + the Fore blocks of L on all unrolled iterations are safe to move before the + blocks of the direct child of L of all iterations. So we require that the + phi node looping operands of ForeHeader can be moved to at least the end of + ForeEnd, so that we can arrange cloned Fore Blocks before the subloop and + match up Phi's correctly. - i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2. - It needs to be safe to tranform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2. + i.e. The old order of blocks used to be + (F1)1 (F2)1 J1_1 J1_2 (A2)1 (A1)1 (F1)2 (F2)2 J2_1 J2_2 (A2)2 (A1)2. + It needs to be safe to transform this to + (F1)1 (F1)2 (F2)1 (F2)2 J1_1 J1_2 J2_1 J2_2 (A2)1 (A2)2 (A1)1 (A1)2. There are then a number of checks along the lines of no calls, no exceptions, inner loop IV is consistent, etc. Note that for loops requiring @@ -724,35 +910,13 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, UnrollAndJamLoop if the trip count cannot be easily calculated. */ - if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1) - return false; - Loop *SubLoop = L->getSubLoops()[0]; - if (!SubLoop->isLoopSimplifyForm()) - return false; - - BasicBlock *Header = L->getHeader(); - BasicBlock *Latch = L->getLoopLatch(); - BasicBlock *Exit = L->getExitingBlock(); - BasicBlock *SubLoopHeader = SubLoop->getHeader(); - BasicBlock *SubLoopLatch = SubLoop->getLoopLatch(); - BasicBlock *SubLoopExit = SubLoop->getExitingBlock(); - - if (Latch != Exit) - return false; - if (SubLoopLatch != SubLoopExit) - return false; - - if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken()) { - LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n"); - return false; - } - // Split blocks into Fore/SubLoop/Aft based on dominators + Loop *JamLoop = getInnerMostLoop(L); BasicBlockSet SubLoopBlocks; - BasicBlockSet ForeBlocks; - BasicBlockSet AftBlocks; - if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, - AftBlocks, &DT)) { + DenseMap<Loop *, BasicBlockSet> ForeBlocksMap; + DenseMap<Loop *, BasicBlockSet> AftBlocksMap; + if (!partitionOuterLoopBlocks(*L, *JamLoop, SubLoopBlocks, ForeBlocksMap, + AftBlocksMap, DT)) { LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n"); return false; } @@ -760,7 +924,7 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, // Aft blocks may need to move instructions to fore blocks, which becomes more // difficult if there are multiple (potentially conditionally executed) // blocks. For now we just exclude loops with multiple aft blocks. - if (AftBlocks.size() != 1) { + if (AftBlocksMap[L].size() != 1) { LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle " "multiple blocks after the loop\n"); return false; @@ -768,7 +932,9 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, // Check inner loop backedge count is consistent on all iterations of the // outer loop - if (!hasIterationCountInvariantInParent(SubLoop, SE)) { + if (any_of(L->getLoopsInPreorder(), [&SE](Loop *SubLoop) { + return !hasIterationCountInvariantInParent(SubLoop, SE); + })) { LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is " "not consistent on each iteration\n"); return false; @@ -789,6 +955,10 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, // ForeBlock phi operands before the subloop // Make sure we can move all instructions we need to before the subloop + BasicBlock *Header = L->getHeader(); + BasicBlock *Latch = L->getLoopLatch(); + BasicBlockSet AftBlocks = AftBlocksMap[L]; + Loop *SubLoop = L->getSubLoops()[0]; if (!processHeaderPhiOperands( Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) { if (SubLoop->contains(I->getParent())) @@ -814,7 +984,8 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, // Check for memory dependencies which prohibit the unrolling we are doing. // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub. - if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI)) { + if (!checkDependencies(*L, SubLoopBlocks, ForeBlocksMap, AftBlocksMap, DI, + LI)) { LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n"); return false; } |