summaryrefslogtreecommitdiff
path: root/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp')
-rw-r--r--llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp533
1 files changed, 352 insertions, 181 deletions
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
index f1965934b2d71..dd628f3e7e0ca 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
@@ -11,31 +11,54 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DependenceAnalysis.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GenericDomTree.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/SimplifyIndVar.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <assert.h>
+#include <memory>
+#include <type_traits>
+#include <vector>
+
using namespace llvm;
#define DEBUG_TYPE "loop-unroll-and-jam"
@@ -47,17 +70,14 @@ typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;
// Partition blocks in an outer/inner loop pair into blocks before and after
// the loop
-static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
- BasicBlockSet &ForeBlocks,
- BasicBlockSet &SubLoopBlocks,
- BasicBlockSet &AftBlocks,
- DominatorTree *DT) {
+static bool partitionLoopBlocks(Loop &L, BasicBlockSet &ForeBlocks,
+ BasicBlockSet &AftBlocks, DominatorTree &DT) {
+ Loop *SubLoop = L.getSubLoops()[0];
BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
- SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
- for (BasicBlock *BB : L->blocks()) {
+ for (BasicBlock *BB : L.blocks()) {
if (!SubLoop->contains(BB)) {
- if (DT->dominates(SubLoopLatch, BB))
+ if (DT.dominates(SubLoopLatch, BB))
AftBlocks.insert(BB);
else
ForeBlocks.insert(BB);
@@ -71,14 +91,44 @@ static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
if (BB == SubLoopPreHeader)
continue;
Instruction *TI = BB->getTerminator();
- for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
- if (!ForeBlocks.count(TI->getSuccessor(i)))
+ for (BasicBlock *Succ : successors(TI))
+ if (!ForeBlocks.count(Succ))
return false;
}
return true;
}
+/// Partition blocks in a loop nest into blocks before and after each inner
+/// loop.
+static bool partitionOuterLoopBlocks(
+ Loop &Root, Loop &JamLoop, BasicBlockSet &JamLoopBlocks,
+ DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
+ DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, DominatorTree &DT) {
+ JamLoopBlocks.insert(JamLoop.block_begin(), JamLoop.block_end());
+
+ for (Loop *L : Root.getLoopsInPreorder()) {
+ if (L == &JamLoop)
+ break;
+
+ if (!partitionLoopBlocks(*L, ForeBlocksMap[L], AftBlocksMap[L], DT))
+ return false;
+ }
+
+ return true;
+}
+
+// TODO Remove when UnrollAndJamLoop changed to support unroll and jamming more
+// than 2 levels loop.
+static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
+ BasicBlockSet &ForeBlocks,
+ BasicBlockSet &SubLoopBlocks,
+ BasicBlockSet &AftBlocks,
+ DominatorTree *DT) {
+ SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
+ return partitionLoopBlocks(*L, ForeBlocks, AftBlocks, *DT);
+}
+
// Looks at the phi nodes in Header for values coming from Latch. For these
// instructions and all their operands calls Visit on them, keeping going for
// all the operands in AftBlocks. Returns false if Visit returns false,
@@ -169,10 +219,12 @@ static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
If EpilogueLoop is non-null, it receives the epilogue loop (if it was
necessary to create one and not fully unrolled).
*/
-LoopUnrollResult llvm::UnrollAndJamLoop(
- Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple,
- bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
- AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
+LoopUnrollResult
+llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
+ unsigned TripMultiple, bool UnrollRemainder,
+ LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
+ AssumptionCache *AC, const TargetTransformInfo *TTI,
+ OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
// When we enter here we should have already checked that it is safe
BasicBlock *Header = L->getHeader();
@@ -198,7 +250,7 @@ LoopUnrollResult llvm::UnrollAndJamLoop(
if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
/*UseEpilogRemainder*/ true,
UnrollRemainder, /*ForgetAllSCEV*/ false,
- LI, SE, DT, AC, true, EpilogueLoop)) {
+ LI, SE, DT, AC, TTI, true, EpilogueLoop)) {
LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
"generated when assuming runtime trip count\n");
return LoopUnrollResult::Unmodified;
@@ -284,8 +336,7 @@ LoopUnrollResult llvm::UnrollAndJamLoop(
// Move any instructions from fore phi operands from AftBlocks into Fore.
moveHeaderPhiOperandsToForeBlocks(
- Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
- AftBlocks);
+ Header, LatchBlock, ForeBlocksLast[0]->getTerminator(), AftBlocks);
// The current on-the-fly SSA update requires blocks to be processed in
// reverse postorder so that LastValueMap contains the correct value at each
@@ -312,32 +363,32 @@ LoopUnrollResult llvm::UnrollAndJamLoop(
// Copy all blocks
for (unsigned It = 1; It != Count; ++It) {
- std::vector<BasicBlock *> NewBlocks;
+ SmallVector<BasicBlock *, 8> NewBlocks;
// Maps Blocks[It] -> Blocks[It-1]
DenseMap<Value *, Value *> PrevItValueMap;
+ SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
+ NewLoops[L] = L;
+ NewLoops[SubLoop] = SubLoop;
for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
ValueToValueMapTy VMap;
BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
Header->getParent()->getBasicBlockList().push_back(New);
- if (ForeBlocks.count(*BB)) {
- L->addBasicBlockToLoop(New, *LI);
+ // Tell LI about New.
+ addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
+ if (ForeBlocks.count(*BB)) {
if (*BB == ForeBlocksFirst[0])
ForeBlocksFirst.push_back(New);
if (*BB == ForeBlocksLast[0])
ForeBlocksLast.push_back(New);
} else if (SubLoopBlocks.count(*BB)) {
- SubLoop->addBasicBlockToLoop(New, *LI);
-
if (*BB == SubLoopBlocksFirst[0])
SubLoopBlocksFirst.push_back(New);
if (*BB == SubLoopBlocksLast[0])
SubLoopBlocksLast.push_back(New);
} else if (AftBlocks.count(*BB)) {
- L->addBasicBlockToLoop(New, *LI);
-
if (*BB == AftBlocksFirst[0])
AftBlocksFirst.push_back(New);
if (*BB == AftBlocksLast[0])
@@ -379,9 +430,9 @@ LoopUnrollResult llvm::UnrollAndJamLoop(
}
// Remap all instructions in the most recent iteration
+ remapInstructionsInBlocks(NewBlocks, LastValueMap);
for (BasicBlock *NewBlock : NewBlocks) {
for (Instruction &I : *NewBlock) {
- ::remapInstruction(&I, LastValueMap);
if (auto *II = dyn_cast<IntrinsicInst>(&I))
if (II->getIntrinsicID() == Intrinsic::assume)
AC->registerAssumption(II);
@@ -447,8 +498,8 @@ LoopUnrollResult llvm::UnrollAndJamLoop(
// Update ForeBlocks successors and phi nodes
BranchInst *ForeTerm =
cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
- BasicBlock *Dest = SubLoopBlocksFirst[0];
- ForeTerm->setSuccessor(0, Dest);
+ assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
+ ForeTerm->setSuccessor(0, SubLoopBlocksFirst[0]);
if (CompletelyUnroll) {
while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
@@ -465,8 +516,8 @@ LoopUnrollResult llvm::UnrollAndJamLoop(
// Remap ForeBlock successors from previous iteration to this
BranchInst *ForeTerm =
cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
- BasicBlock *Dest = ForeBlocksFirst[It];
- ForeTerm->setSuccessor(0, Dest);
+ assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
+ ForeTerm->setSuccessor(0, ForeBlocksFirst[It]);
}
// Subloop successors and phis
@@ -495,12 +546,14 @@ LoopUnrollResult llvm::UnrollAndJamLoop(
}
// Aft blocks successors and phis
- BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
+ BranchInst *AftTerm = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
if (CompletelyUnroll) {
- BranchInst::Create(LoopExit, Term);
- Term->eraseFromParent();
+ BranchInst::Create(LoopExit, AftTerm);
+ AftTerm->eraseFromParent();
} else {
- Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
+ AftTerm->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
+ assert(AftTerm->getSuccessor(ContinueOnTrue) == LoopExit &&
+ "Expecting the ContinueOnTrue successor of AftTerm to be LoopExit");
}
updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
SubLoopBlocksLast.back());
@@ -540,55 +593,48 @@ LoopUnrollResult llvm::UnrollAndJamLoop(
MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
- while (!MergeBlocks.empty()) {
- BasicBlock *BB = *MergeBlocks.begin();
- BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
- if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
- BasicBlock *Dest = Term->getSuccessor(0);
- BasicBlock *Fold = Dest->getUniquePredecessor();
- if (MergeBlockIntoPredecessor(Dest, &DTU, LI)) {
- // Don't remove BB and add Fold as they are the same BB
- assert(Fold == BB);
- (void)Fold;
- MergeBlocks.erase(Dest);
- } else
- MergeBlocks.erase(BB);
- } else
- MergeBlocks.erase(BB);
- }
+
+ MergeBlockSuccessorsIntoGivenBlocks(MergeBlocks, L, &DTU, LI);
+
// Apply updates to the DomTree.
DT = &DTU.getDomTree();
// At this point, the code is well formed. We now do a quick sweep over the
// inserted code, doing constant propagation and dead code elimination as we
// go.
- simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
- simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);
+ simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC, TTI);
+ simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC,
+ TTI);
NumCompletelyUnrolledAndJammed += CompletelyUnroll;
++NumUnrolledAndJammed;
+ // Update LoopInfo if the loop is completely removed.
+ if (CompletelyUnroll)
+ LI->erase(L);
+
#ifndef NDEBUG
// We shouldn't have done anything to break loop simplify form or LCSSA.
- Loop *OuterL = L->getParentLoop();
- Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
+ Loop *OutestLoop = SubLoop->getParentLoop()
+ ? SubLoop->getParentLoop()->getParentLoop()
+ ? SubLoop->getParentLoop()->getParentLoop()
+ : SubLoop->getParentLoop()
+ : SubLoop;
+ assert(DT->verify());
+ LI->verify(*DT);
assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
if (!CompletelyUnroll)
assert(L->isLoopSimplifyForm());
assert(SubLoop->isLoopSimplifyForm());
- assert(DT->verify());
+ SE->verify();
#endif
- // Update LoopInfo if the loop is completely removed.
- if (CompletelyUnroll)
- LI->erase(L);
-
return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
: LoopUnrollResult::PartiallyUnrolled;
}
static bool getLoadsAndStores(BasicBlockSet &Blocks,
- SmallVector<Value *, 4> &MemInstr) {
+ SmallVector<Instruction *, 4> &MemInstr) {
// Scan the BBs and collect legal loads and stores.
// Returns false if non-simple loads/stores are found.
for (BasicBlock *BB : Blocks) {
@@ -609,97 +655,235 @@ static bool getLoadsAndStores(BasicBlockSet &Blocks,
return true;
}
-static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
- SmallVector<Value *, 4> &Later,
- unsigned LoopDepth, bool InnerLoop,
- DependenceInfo &DI) {
- // Use DA to check for dependencies between loads and stores that make unroll
- // and jam invalid
- for (Value *I : Earlier) {
- for (Value *J : Later) {
- Instruction *Src = cast<Instruction>(I);
- Instruction *Dst = cast<Instruction>(J);
- if (Src == Dst)
- continue;
- // Ignore Input dependencies.
- if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
- continue;
-
- // Track dependencies, and if we find them take a conservative approach
- // by allowing only = or < (not >), altough some > would be safe
- // (depending upon unroll width).
- // For the inner loop, we need to disallow any (> <) dependencies
- // FIXME: Allow > so long as distance is less than unroll width
- if (auto D = DI.depends(Src, Dst, true)) {
- assert(D->isOrdered() && "Expected an output, flow or anti dep.");
-
- if (D->isConfused()) {
- LLVM_DEBUG(dbgs() << " Confused dependency between:\n"
- << " " << *Src << "\n"
- << " " << *Dst << "\n");
+static bool preservesForwardDependence(Instruction *Src, Instruction *Dst,
+ unsigned UnrollLevel, unsigned JamLevel,
+ bool Sequentialized, Dependence *D) {
+ // UnrollLevel might carry the dependency Src --> Dst
+ // Does a different loop after unrolling?
+ for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
+ ++CurLoopDepth) {
+ auto JammedDir = D->getDirection(CurLoopDepth);
+ if (JammedDir == Dependence::DVEntry::LT)
+ return true;
+
+ if (JammedDir & Dependence::DVEntry::GT)
+ return false;
+ }
+
+ return true;
+}
+
+static bool preservesBackwardDependence(Instruction *Src, Instruction *Dst,
+ unsigned UnrollLevel, unsigned JamLevel,
+ bool Sequentialized, Dependence *D) {
+ // UnrollLevel might carry the dependency Dst --> Src
+ for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
+ ++CurLoopDepth) {
+ auto JammedDir = D->getDirection(CurLoopDepth);
+ if (JammedDir == Dependence::DVEntry::GT)
+ return true;
+
+ if (JammedDir & Dependence::DVEntry::LT)
+ return false;
+ }
+
+ // Backward dependencies are only preserved if not interleaved.
+ return Sequentialized;
+}
+
+// Check whether it is semantically safe Src and Dst considering any potential
+// dependency between them.
+//
+// @param UnrollLevel The level of the loop being unrolled
+// @param JamLevel The level of the loop being jammed; if Src and Dst are on
+// different levels, the outermost common loop counts as jammed level
+//
+// @return true if is safe and false if there is a dependency violation.
+static bool checkDependency(Instruction *Src, Instruction *Dst,
+ unsigned UnrollLevel, unsigned JamLevel,
+ bool Sequentialized, DependenceInfo &DI) {
+ assert(UnrollLevel <= JamLevel &&
+ "Expecting JamLevel to be at least UnrollLevel");
+
+ if (Src == Dst)
+ return true;
+ // Ignore Input dependencies.
+ if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
+ return true;
+
+ // Check whether unroll-and-jam may violate a dependency.
+ // By construction, every dependency will be lexicographically non-negative
+ // (if it was, it would violate the current execution order), such as
+ // (0,0,>,*,*)
+ // Unroll-and-jam changes the GT execution of two executions to the same
+ // iteration of the chosen unroll level. That is, a GT dependence becomes a GE
+ // dependence (or EQ, if we fully unrolled the loop) at the loop's position:
+ // (0,0,>=,*,*)
+ // Now, the dependency is not necessarily non-negative anymore, i.e.
+ // unroll-and-jam may violate correctness.
+ std::unique_ptr<Dependence> D = DI.depends(Src, Dst, true);
+ if (!D)
+ return true;
+ assert(D->isOrdered() && "Expected an output, flow or anti dep.");
+
+ if (D->isConfused()) {
+ LLVM_DEBUG(dbgs() << " Confused dependency between:\n"
+ << " " << *Src << "\n"
+ << " " << *Dst << "\n");
+ return false;
+ }
+
+ // If outer levels (levels enclosing the loop being unroll-and-jammed) have a
+ // non-equal direction, then the locations accessed in the inner levels cannot
+ // overlap in memory. We assumes the indexes never overlap into neighboring
+ // dimensions.
+ for (unsigned CurLoopDepth = 1; CurLoopDepth < UnrollLevel; ++CurLoopDepth)
+ if (!(D->getDirection(CurLoopDepth) & Dependence::DVEntry::EQ))
+ return true;
+
+ auto UnrollDirection = D->getDirection(UnrollLevel);
+
+ // If the distance carried by the unrolled loop is 0, then after unrolling
+ // that distance will become non-zero resulting in non-overlapping accesses in
+ // the inner loops.
+ if (UnrollDirection == Dependence::DVEntry::EQ)
+ return true;
+
+ if (UnrollDirection & Dependence::DVEntry::LT &&
+ !preservesForwardDependence(Src, Dst, UnrollLevel, JamLevel,
+ Sequentialized, D.get()))
+ return false;
+
+ if (UnrollDirection & Dependence::DVEntry::GT &&
+ !preservesBackwardDependence(Src, Dst, UnrollLevel, JamLevel,
+ Sequentialized, D.get()))
+ return false;
+
+ return true;
+}
+
+static bool
+checkDependencies(Loop &Root, const BasicBlockSet &SubLoopBlocks,
+ const DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
+ const DenseMap<Loop *, BasicBlockSet> &AftBlocksMap,
+ DependenceInfo &DI, LoopInfo &LI) {
+ SmallVector<BasicBlockSet, 8> AllBlocks;
+ for (Loop *L : Root.getLoopsInPreorder())
+ if (ForeBlocksMap.find(L) != ForeBlocksMap.end())
+ AllBlocks.push_back(ForeBlocksMap.lookup(L));
+ AllBlocks.push_back(SubLoopBlocks);
+ for (Loop *L : Root.getLoopsInPreorder())
+ if (AftBlocksMap.find(L) != AftBlocksMap.end())
+ AllBlocks.push_back(AftBlocksMap.lookup(L));
+
+ unsigned LoopDepth = Root.getLoopDepth();
+ SmallVector<Instruction *, 4> EarlierLoadsAndStores;
+ SmallVector<Instruction *, 4> CurrentLoadsAndStores;
+ for (BasicBlockSet &Blocks : AllBlocks) {
+ CurrentLoadsAndStores.clear();
+ if (!getLoadsAndStores(Blocks, CurrentLoadsAndStores))
+ return false;
+
+ Loop *CurLoop = LI.getLoopFor((*Blocks.begin())->front().getParent());
+ unsigned CurLoopDepth = CurLoop->getLoopDepth();
+
+ for (auto *Earlier : EarlierLoadsAndStores) {
+ Loop *EarlierLoop = LI.getLoopFor(Earlier->getParent());
+ unsigned EarlierDepth = EarlierLoop->getLoopDepth();
+ unsigned CommonLoopDepth = std::min(EarlierDepth, CurLoopDepth);
+ for (auto *Later : CurrentLoadsAndStores) {
+ if (!checkDependency(Earlier, Later, LoopDepth, CommonLoopDepth, false,
+ DI))
return false;
- }
- if (!InnerLoop) {
- if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) {
- LLVM_DEBUG(dbgs() << " > dependency between:\n"
- << " " << *Src << "\n"
- << " " << *Dst << "\n");
- return false;
- }
- } else {
- assert(LoopDepth + 1 <= D->getLevels());
- if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
- D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT) {
- LLVM_DEBUG(dbgs() << " < > dependency between:\n"
- << " " << *Src << "\n"
- << " " << *Dst << "\n");
- return false;
- }
- }
}
}
+
+ size_t NumInsts = CurrentLoadsAndStores.size();
+ for (size_t I = 0; I < NumInsts; ++I) {
+ for (size_t J = I; J < NumInsts; ++J) {
+ if (!checkDependency(CurrentLoadsAndStores[I], CurrentLoadsAndStores[J],
+ LoopDepth, CurLoopDepth, true, DI))
+ return false;
+ }
+ }
+
+ EarlierLoadsAndStores.append(CurrentLoadsAndStores.begin(),
+ CurrentLoadsAndStores.end());
}
return true;
}
-static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks,
- BasicBlockSet &SubLoopBlocks,
- BasicBlockSet &AftBlocks, DependenceInfo &DI) {
- // Get all loads/store pairs for each blocks
- SmallVector<Value *, 4> ForeMemInstr;
- SmallVector<Value *, 4> SubLoopMemInstr;
- SmallVector<Value *, 4> AftMemInstr;
- if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
- !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
- !getLoadsAndStores(AftBlocks, AftMemInstr))
+static bool isEligibleLoopForm(const Loop &Root) {
+ // Root must have a child.
+ if (Root.getSubLoops().size() != 1)
return false;
- // Check for dependencies between any blocks that may change order
- unsigned LoopDepth = L->getLoopDepth();
- return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
- DI) &&
- checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
- checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
- DI) &&
- checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
- DI);
+ const Loop *L = &Root;
+ do {
+ // All loops in Root need to be in simplify and rotated form.
+ if (!L->isLoopSimplifyForm())
+ return false;
+
+ if (!L->isRotatedForm())
+ return false;
+
+ if (L->getHeader()->hasAddressTaken()) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
+ return false;
+ }
+
+ unsigned SubLoopsSize = L->getSubLoops().size();
+ if (SubLoopsSize == 0)
+ return true;
+
+ // Only one child is allowed.
+ if (SubLoopsSize != 1)
+ return false;
+
+ L = L->getSubLoops()[0];
+ } while (L);
+
+ return true;
+}
+
+static Loop *getInnerMostLoop(Loop *L) {
+ while (!L->getSubLoops().empty())
+ L = L->getSubLoops()[0];
+ return L;
}
bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
- DependenceInfo &DI) {
+ DependenceInfo &DI, LoopInfo &LI) {
+ if (!isEligibleLoopForm(*L)) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Ineligible loop form\n");
+ return false;
+ }
+
/* We currently handle outer loops like this:
|
- ForeFirst <----\ }
- Blocks | } ForeBlocks
- ForeLast | }
- | |
- SubLoopFirst <\ | }
- Blocks | | } SubLoopBlocks
- SubLoopLast -/ | }
- | |
- AftFirst | }
- Blocks | } AftBlocks
- AftLast ------/ }
+ ForeFirst <------\ }
+ Blocks | } ForeBlocks of L
+ ForeLast | }
+ | |
+ ... |
+ | |
+ ForeFirst <----\ | }
+ Blocks | | } ForeBlocks of a inner loop of L
+ ForeLast | | }
+ | | |
+ JamLoopFirst <\ | | }
+ Blocks | | | } JamLoopBlocks of the innermost loop
+ JamLoopLast -/ | | }
+ | | |
+ AftFirst | | }
+ Blocks | | } AftBlocks of a inner loop of L
+ AftLast ------/ | }
+ | |
+ ... |
+ | |
+ AftFirst | }
+ Blocks | } AftBlocks of L
+ AftLast --------/ }
|
There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
@@ -709,14 +893,16 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
things further in the profitablility checks of the unroll and jam pass.
Because of the way we rearrange basic blocks, we also require that
- the Fore blocks on all unrolled iterations are safe to move before the
- SubLoop blocks of all iterations. So we require that the phi node looping
- operands of ForeHeader can be moved to at least the end of ForeEnd, so that
- we can arrange cloned Fore Blocks before the subloop and match up Phi's
- correctly.
+ the Fore blocks of L on all unrolled iterations are safe to move before the
+ blocks of the direct child of L of all iterations. So we require that the
+ phi node looping operands of ForeHeader can be moved to at least the end of
+ ForeEnd, so that we can arrange cloned Fore Blocks before the subloop and
+ match up Phi's correctly.
- i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
- It needs to be safe to tranform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.
+ i.e. The old order of blocks used to be
+ (F1)1 (F2)1 J1_1 J1_2 (A2)1 (A1)1 (F1)2 (F2)2 J2_1 J2_2 (A2)2 (A1)2.
+ It needs to be safe to transform this to
+ (F1)1 (F1)2 (F2)1 (F2)2 J1_1 J1_2 J2_1 J2_2 (A2)1 (A2)2 (A1)1 (A1)2.
There are then a number of checks along the lines of no calls, no
exceptions, inner loop IV is consistent, etc. Note that for loops requiring
@@ -724,35 +910,13 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
UnrollAndJamLoop if the trip count cannot be easily calculated.
*/
- if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
- return false;
- Loop *SubLoop = L->getSubLoops()[0];
- if (!SubLoop->isLoopSimplifyForm())
- return false;
-
- BasicBlock *Header = L->getHeader();
- BasicBlock *Latch = L->getLoopLatch();
- BasicBlock *Exit = L->getExitingBlock();
- BasicBlock *SubLoopHeader = SubLoop->getHeader();
- BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
- BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
-
- if (Latch != Exit)
- return false;
- if (SubLoopLatch != SubLoopExit)
- return false;
-
- if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken()) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
- return false;
- }
-
// Split blocks into Fore/SubLoop/Aft based on dominators
+ Loop *JamLoop = getInnerMostLoop(L);
BasicBlockSet SubLoopBlocks;
- BasicBlockSet ForeBlocks;
- BasicBlockSet AftBlocks;
- if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
- AftBlocks, &DT)) {
+ DenseMap<Loop *, BasicBlockSet> ForeBlocksMap;
+ DenseMap<Loop *, BasicBlockSet> AftBlocksMap;
+ if (!partitionOuterLoopBlocks(*L, *JamLoop, SubLoopBlocks, ForeBlocksMap,
+ AftBlocksMap, DT)) {
LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
return false;
}
@@ -760,7 +924,7 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
// Aft blocks may need to move instructions to fore blocks, which becomes more
// difficult if there are multiple (potentially conditionally executed)
// blocks. For now we just exclude loops with multiple aft blocks.
- if (AftBlocks.size() != 1) {
+ if (AftBlocksMap[L].size() != 1) {
LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
"multiple blocks after the loop\n");
return false;
@@ -768,7 +932,9 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
// Check inner loop backedge count is consistent on all iterations of the
// outer loop
- if (!hasIterationCountInvariantInParent(SubLoop, SE)) {
+ if (any_of(L->getLoopsInPreorder(), [&SE](Loop *SubLoop) {
+ return !hasIterationCountInvariantInParent(SubLoop, SE);
+ })) {
LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
"not consistent on each iteration\n");
return false;
@@ -789,6 +955,10 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
// ForeBlock phi operands before the subloop
// Make sure we can move all instructions we need to before the subloop
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlockSet AftBlocks = AftBlocksMap[L];
+ Loop *SubLoop = L->getSubLoops()[0];
if (!processHeaderPhiOperands(
Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
if (SubLoop->contains(I->getParent()))
@@ -814,7 +984,8 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
// Check for memory dependencies which prohibit the unrolling we are doing.
// Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
// there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
- if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI)) {
+ if (!checkDependencies(*L, SubLoopBlocks, ForeBlocksMap, AftBlocksMap, DI,
+ LI)) {
LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
return false;
}