diff options
Diffstat (limited to 'llvm/lib/Transforms/Scalar/LoopFuse.cpp')
| -rw-r--r-- | llvm/lib/Transforms/Scalar/LoopFuse.cpp | 342 |
1 files changed, 303 insertions, 39 deletions
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index d94b767c7b63..0eecec373736 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -67,6 +67,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/CodeMoverUtils.h" #include "llvm/Transforms/Utils/LoopPeel.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" using namespace llvm; @@ -101,6 +102,8 @@ STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with " STATISTIC(NotRotated, "Candidate is not rotated"); STATISTIC(OnlySecondCandidateIsGuarded, "The second candidate is guarded while the first one is not"); +STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions."); +STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions."); enum FusionDependenceAnalysisChoice { FUSION_DEPENDENCE_ANALYSIS_SCEV, @@ -183,9 +186,8 @@ struct FusionCandidate { OptimizationRemarkEmitter &ORE; - FusionCandidate(Loop *L, DominatorTree &DT, - const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE, - TTI::PeelingPreferences PP) + FusionCandidate(Loop *L, DominatorTree &DT, const PostDominatorTree *PDT, + OptimizationRemarkEmitter &ORE, TTI::PeelingPreferences PP) : Preheader(L->getLoopPreheader()), Header(L->getHeader()), ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()), Latch(L->getLoopLatch()), L(L), Valid(true), @@ -387,7 +389,13 @@ struct FusionCandidateCompare { /// Comparison functor to sort two Control Flow Equivalent fusion candidates /// into dominance order. /// If LHS dominates RHS and RHS post-dominates LHS, return true; - /// IF RHS dominates LHS and LHS post-dominates RHS, return false; + /// If RHS dominates LHS and LHS post-dominates RHS, return false; + /// If both LHS and RHS are not dominating each other then, non-strictly + /// post dominate check will decide the order of candidates. If RHS + /// non-strictly post dominates LHS then, return true. If LHS non-strictly + /// post dominates RHS then, return false. If both are non-strictly post + /// dominate each other then, level in the post dominator tree will decide + /// the order of candidates. bool operator()(const FusionCandidate &LHS, const FusionCandidate &RHS) const { const DominatorTree *DT = &(LHS.DT); @@ -413,9 +421,29 @@ struct FusionCandidateCompare { return true; } - // If LHS does not dominate RHS and RHS does not dominate LHS then there is - // no dominance relationship between the two FusionCandidates. Thus, they - // should not be in the same set together. + // If two FusionCandidates are in the same level of dominator tree, + // they will not dominate each other, but may still be control flow + // equivalent. To sort those FusionCandidates, nonStrictlyPostDominate() + // function is needed. + bool WrongOrder = + nonStrictlyPostDominate(LHSEntryBlock, RHSEntryBlock, DT, LHS.PDT); + bool RightOrder = + nonStrictlyPostDominate(RHSEntryBlock, LHSEntryBlock, DT, LHS.PDT); + if (WrongOrder && RightOrder) { + // If common predecessor of LHS and RHS post dominates both + // FusionCandidates then, Order of FusionCandidate can be + // identified by its level in post dominator tree. + DomTreeNode *LNode = LHS.PDT->getNode(LHSEntryBlock); + DomTreeNode *RNode = LHS.PDT->getNode(RHSEntryBlock); + return LNode->getLevel() > RNode->getLevel(); + } else if (WrongOrder) + return false; + else if (RightOrder) + return true; + + // If LHS does not non-strict Postdominate RHS and RHS does not non-strict + // Postdominate LHS then, there is no dominance relationship between the + // two FusionCandidates. Thus, they should not be in the same set together. llvm_unreachable( "No dominance relationship between these fusion candidates!"); } @@ -427,7 +455,7 @@ using LoopVector = SmallVector<Loop *, 4>; // order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0 // dominates FC1 and FC1 post-dominates FC0. // std::set was chosen because we want a sorted data structure with stable -// iterators. A subsequent patch to loop fusion will enable fusing non-ajdacent +// iterators. A subsequent patch to loop fusion will enable fusing non-adjacent // loops by moving intervening code around. When this intervening code contains // loops, those loops will be moved also. The corresponding FusionCandidates // will also need to be moved accordingly. As this is done, having stable @@ -528,7 +556,7 @@ private: #ifndef NDEBUG static void printLoopVector(const LoopVector &LV) { dbgs() << "****************************\n"; - for (auto L : LV) + for (auto *L : LV) printLoop(*L, dbgs()); dbgs() << "****************************\n"; } @@ -549,7 +577,6 @@ private: PostDominatorTree &PDT; OptimizationRemarkEmitter &ORE; AssumptionCache &AC; - const TargetTransformInfo &TTI; public: @@ -644,7 +671,7 @@ private: void collectFusionCandidates(const LoopVector &LV) { for (Loop *L : LV) { TTI::PeelingPreferences PP = - gatherPeelingPreferences(L, SE, TTI, None, None); + gatherPeelingPreferences(L, SE, TTI, std::nullopt, std::nullopt); FusionCandidate CurrCand(L, DT, &PDT, ORE, PP); if (!CurrCand.isEligibleForFusion(SE)) continue; @@ -699,23 +726,22 @@ private: /// stating whether or not the two candidates are known at compile time to /// have the same TripCount. The second is the difference in the two /// TripCounts. This information can be used later to determine whether or not - /// peeling can be performed on either one of the candiates. - std::pair<bool, Optional<unsigned>> + /// peeling can be performed on either one of the candidates. + std::pair<bool, std::optional<unsigned>> haveIdenticalTripCounts(const FusionCandidate &FC0, const FusionCandidate &FC1) const { - const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L); if (isa<SCEVCouldNotCompute>(TripCount0)) { UncomputableTripCount++; LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!"); - return {false, None}; + return {false, std::nullopt}; } const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L); if (isa<SCEVCouldNotCompute>(TripCount1)) { UncomputableTripCount++; LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!"); - return {false, None}; + return {false, std::nullopt}; } LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & " @@ -740,10 +766,10 @@ private: LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not " "have a constant number of iterations. Peeling " "is not benefical\n"); - return {false, None}; + return {false, std::nullopt}; } - Optional<unsigned> Difference = None; + std::optional<unsigned> Difference; int Diff = TC0 - TC1; if (Diff > 0) @@ -767,7 +793,8 @@ private: LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount << " iterations of the first loop. \n"); - FC0.Peeled = peelLoop(FC0.L, PeelCount, &LI, &SE, DT, &AC, true); + ValueToValueMapTy VMap; + FC0.Peeled = peelLoop(FC0.L, PeelCount, &LI, &SE, DT, &AC, true, VMap); if (FC0.Peeled) { LLVM_DEBUG(dbgs() << "Done Peeling\n"); @@ -807,7 +834,7 @@ private: } // Cannot modify the predecessors inside the above loop as it will cause // the iterators to be nullptrs, causing memory errors. - for (Instruction *CurrentBranch: WorkList) { + for (Instruction *CurrentBranch : WorkList) { BasicBlock *Succ = CurrentBranch->getSuccessor(0); if (Succ == BB) Succ = CurrentBranch->getSuccessor(1); @@ -858,12 +885,12 @@ private: // Check if the candidates have identical tripcounts (first value of // pair), and if not check the difference in the tripcounts between // the loops (second value of pair). The difference is not equal to - // None iff the loops iterate a constant number of times, and have a - // single exit. - std::pair<bool, Optional<unsigned>> IdenticalTripCountRes = + // std::nullopt iff the loops iterate a constant number of times, and + // have a single exit. + std::pair<bool, std::optional<unsigned>> IdenticalTripCountRes = haveIdenticalTripCounts(*FC0, *FC1); bool SameTripCount = IdenticalTripCountRes.first; - Optional<unsigned> TCDifference = IdenticalTripCountRes.second; + std::optional<unsigned> TCDifference = IdenticalTripCountRes.second; // Here we are checking that FC0 (the first loop) can be peeled, and // both loops have different tripcounts. @@ -895,9 +922,10 @@ private: continue; } - if (!FC0->GuardBranch && FC1->GuardBranch) { - LLVM_DEBUG(dbgs() << "The second candidate is guarded while the " - "first one is not. Not fusing.\n"); + if ((!FC0->GuardBranch && FC1->GuardBranch) || + (FC0->GuardBranch && !FC1->GuardBranch)) { + LLVM_DEBUG(dbgs() << "The one of candidate is guarded while the " + "another one is not. Not fusing.\n"); reportLoopFusion<OptimizationRemarkMissed>( *FC0, *FC1, OnlySecondCandidateIsGuarded); continue; @@ -914,16 +942,6 @@ private: continue; } - if (!isSafeToMoveBefore(*FC1->Preheader, - *FC0->Preheader->getTerminator(), DT, &PDT, - &DI)) { - LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe " - "instructions in preheader. Not fusing.\n"); - reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, - NonEmptyPreheader); - continue; - } - if (FC0->GuardBranch) { assert(FC1->GuardBranch && "Expecting valid FC1 guard branch"); @@ -959,6 +977,31 @@ private: continue; } + // If the second loop has instructions in the pre-header, attempt to + // hoist them up to the first loop's pre-header or sink them into the + // body of the second loop. + SmallVector<Instruction *, 4> SafeToHoist; + SmallVector<Instruction *, 4> SafeToSink; + // At this point, this is the last remaining legality check. + // Which means if we can make this pre-header empty, we can fuse + // these loops + if (!isEmptyPreheader(*FC1)) { + LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty " + "preheader.\n"); + + // If it is not safe to hoist/sink all instructions in the + // pre-header, we cannot fuse these loops. + if (!collectMovablePreheaderInsts(*FC0, *FC1, SafeToHoist, + SafeToSink)) { + LLVM_DEBUG(dbgs() << "Could not hoist/sink all instructions in " + "Fusion Candidate Pre-header.\n" + << "Not Fusing.\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonEmptyPreheader); + continue; + } + } + bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1); LLVM_DEBUG(dbgs() << "\tFusion appears to be " @@ -972,6 +1015,9 @@ private: // and profitable. At this point, start transforming the code and // perform fusion. + // Execute the hoist/sink operations on preheader instructions + movePreheaderInsts(*FC0, *FC1, SafeToHoist, SafeToSink); + LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and " << *FC1 << "\n"); @@ -1022,6 +1068,170 @@ private: return Fused; } + // Returns true if the instruction \p I can be hoisted to the end of the + // preheader of \p FC0. \p SafeToHoist contains the instructions that are + // known to be safe to hoist. The instructions encountered that cannot be + // hoisted are in \p NotHoisting. + // TODO: Move functionality into CodeMoverUtils + bool canHoistInst(Instruction &I, + const SmallVector<Instruction *, 4> &SafeToHoist, + const SmallVector<Instruction *, 4> &NotHoisting, + const FusionCandidate &FC0) const { + const BasicBlock *FC0PreheaderTarget = FC0.Preheader->getSingleSuccessor(); + assert(FC0PreheaderTarget && + "Expected single successor for loop preheader."); + + for (Use &Op : I.operands()) { + if (auto *OpInst = dyn_cast<Instruction>(Op)) { + bool OpHoisted = is_contained(SafeToHoist, OpInst); + // Check if we have already decided to hoist this operand. In this + // case, it does not dominate FC0 *yet*, but will after we hoist it. + if (!(OpHoisted || DT.dominates(OpInst, FC0PreheaderTarget))) { + return false; + } + } + } + + // PHIs in FC1's header only have FC0 blocks as predecessors. PHIs + // cannot be hoisted and should be sunk to the exit of the fused loop. + if (isa<PHINode>(I)) + return false; + + // If this isn't a memory inst, hoisting is safe + if (!I.mayReadOrWriteMemory()) + return true; + + LLVM_DEBUG(dbgs() << "Checking if this mem inst can be hoisted.\n"); + for (Instruction *NotHoistedInst : NotHoisting) { + if (auto D = DI.depends(&I, NotHoistedInst, true)) { + // Dependency is not read-before-write, write-before-read or + // write-before-write + if (D->isFlow() || D->isAnti() || D->isOutput()) { + LLVM_DEBUG(dbgs() << "Inst depends on an instruction in FC1's " + "preheader that is not being hoisted.\n"); + return false; + } + } + } + + for (Instruction *ReadInst : FC0.MemReads) { + if (auto D = DI.depends(ReadInst, &I, true)) { + // Dependency is not read-before-write + if (D->isAnti()) { + LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC0.\n"); + return false; + } + } + } + + for (Instruction *WriteInst : FC0.MemWrites) { + if (auto D = DI.depends(WriteInst, &I, true)) { + // Dependency is not write-before-read or write-before-write + if (D->isFlow() || D->isOutput()) { + LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC0.\n"); + return false; + } + } + } + return true; + } + + // Returns true if the instruction \p I can be sunk to the top of the exit + // block of \p FC1. + // TODO: Move functionality into CodeMoverUtils + bool canSinkInst(Instruction &I, const FusionCandidate &FC1) const { + for (User *U : I.users()) { + if (auto *UI{dyn_cast<Instruction>(U)}) { + // Cannot sink if user in loop + // If FC1 has phi users of this value, we cannot sink it into FC1. + if (FC1.L->contains(UI)) { + // Cannot hoist or sink this instruction. No hoisting/sinking + // should take place, loops should not fuse + return false; + } + } + } + + // If this isn't a memory inst, sinking is safe + if (!I.mayReadOrWriteMemory()) + return true; + + for (Instruction *ReadInst : FC1.MemReads) { + if (auto D = DI.depends(&I, ReadInst, true)) { + // Dependency is not write-before-read + if (D->isFlow()) { + LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC1.\n"); + return false; + } + } + } + + for (Instruction *WriteInst : FC1.MemWrites) { + if (auto D = DI.depends(&I, WriteInst, true)) { + // Dependency is not write-before-write or read-before-write + if (D->isOutput() || D->isAnti()) { + LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC1.\n"); + return false; + } + } + } + + return true; + } + + /// Collect instructions in the \p FC1 Preheader that can be hoisted + /// to the \p FC0 Preheader or sunk into the \p FC1 Body + bool collectMovablePreheaderInsts( + const FusionCandidate &FC0, const FusionCandidate &FC1, + SmallVector<Instruction *, 4> &SafeToHoist, + SmallVector<Instruction *, 4> &SafeToSink) const { + BasicBlock *FC1Preheader = FC1.Preheader; + // Save the instructions that are not being hoisted, so we know not to hoist + // mem insts that they dominate. + SmallVector<Instruction *, 4> NotHoisting; + + for (Instruction &I : *FC1Preheader) { + // Can't move a branch + if (&I == FC1Preheader->getTerminator()) + continue; + // If the instruction has side-effects, give up. + // TODO: The case of mayReadFromMemory we can handle but requires + // additional work with a dependence analysis so for now we give + // up on memory reads. + if (I.mayThrow() || !I.willReturn()) { + LLVM_DEBUG(dbgs() << "Inst: " << I << " may throw or won't return.\n"); + return false; + } + + LLVM_DEBUG(dbgs() << "Checking Inst: " << I << "\n"); + + if (I.isAtomic() || I.isVolatile()) { + LLVM_DEBUG( + dbgs() << "\tInstruction is volatile or atomic. Cannot move it.\n"); + return false; + } + + if (canHoistInst(I, SafeToHoist, NotHoisting, FC0)) { + SafeToHoist.push_back(&I); + LLVM_DEBUG(dbgs() << "\tSafe to hoist.\n"); + } else { + LLVM_DEBUG(dbgs() << "\tCould not hoist. Trying to sink...\n"); + NotHoisting.push_back(&I); + + if (canSinkInst(I, FC1)) { + SafeToSink.push_back(&I); + LLVM_DEBUG(dbgs() << "\tSafe to sink.\n"); + } else { + LLVM_DEBUG(dbgs() << "\tCould not sink.\n"); + return false; + } + } + } + LLVM_DEBUG( + dbgs() << "All preheader instructions could be sunk or hoisted!\n"); + return true; + } + /// Rewrite all additive recurrences in a SCEV to use a new loop. class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> { public: @@ -1034,7 +1244,7 @@ private: const Loop *ExprL = Expr->getLoop(); SmallVector<const SCEV *, 2> Operands; if (ExprL == &OldL) { - Operands.append(Expr->op_begin(), Expr->op_end()); + append_range(Operands, Expr->operands()); return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags()); } @@ -1235,6 +1445,46 @@ private: return FC0.ExitBlock == FC1.getEntryBlock(); } + bool isEmptyPreheader(const FusionCandidate &FC) const { + return FC.Preheader->size() == 1; + } + + /// Hoist \p FC1 Preheader instructions to \p FC0 Preheader + /// and sink others into the body of \p FC1. + void movePreheaderInsts(const FusionCandidate &FC0, + const FusionCandidate &FC1, + SmallVector<Instruction *, 4> &HoistInsts, + SmallVector<Instruction *, 4> &SinkInsts) const { + // All preheader instructions except the branch must be hoisted or sunk + assert(HoistInsts.size() + SinkInsts.size() == FC1.Preheader->size() - 1 && + "Attempting to sink and hoist preheader instructions, but not all " + "the preheader instructions are accounted for."); + + NumHoistedInsts += HoistInsts.size(); + NumSunkInsts += SinkInsts.size(); + + LLVM_DEBUG(if (VerboseFusionDebugging) { + if (!HoistInsts.empty()) + dbgs() << "Hoisting: \n"; + for (Instruction *I : HoistInsts) + dbgs() << *I << "\n"; + if (!SinkInsts.empty()) + dbgs() << "Sinking: \n"; + for (Instruction *I : SinkInsts) + dbgs() << *I << "\n"; + }); + + for (Instruction *I : HoistInsts) { + assert(I->getParent() == FC1.Preheader); + I->moveBefore(FC0.Preheader->getTerminator()); + } + // insert instructions in reverse order to maintain dominance relationship + for (Instruction *I : reverse(SinkInsts)) { + assert(I->getParent() == FC1.Preheader); + I->moveBefore(&*FC1.ExitBlock->getFirstInsertionPt()); + } + } + /// Determine if two fusion candidates have identical guards /// /// This method will determine if two fusion candidates have the same guards. @@ -1480,6 +1730,7 @@ private: // mergeLatch may remove the only block in FC1. SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); + SE.forgetLoopDispositions(); // Move instructions from FC0.Latch to FC1.Latch. // Note: mergeLatch requires an updated DT. @@ -1772,6 +2023,7 @@ private: // mergeLatch may remove the only block in FC1. SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); + SE.forgetLoopDispositions(); // Move instructions from FC0.Latch to FC1.Latch. // Note: mergeLatch requires an updated DT. @@ -1838,6 +2090,7 @@ struct LoopFuseLegacy : public FunctionPass { bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI(); @@ -1866,8 +2119,19 @@ PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) { const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); const DataLayout &DL = F.getParent()->getDataLayout(); + // Ensure loops are in simplifed form which is a pre-requisite for loop fusion + // pass. Added only for new PM since the legacy PM has already added + // LoopSimplify pass as a dependency. + bool Changed = false; + for (auto &L : LI) { + Changed |= + simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */); + } + if (Changed) + PDT.recalculate(F); + LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI); - bool Changed = LF.fuseLoops(F); + Changed |= LF.fuseLoops(F); if (!Changed) return PreservedAnalyses::all(); |
