diff options
author | Dimitry Andric <dim@FreeBSD.org> | 2020-01-22 20:31:01 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2020-01-22 20:31:01 +0000 |
commit | 8bcb0991864975618c09697b1aca10683346d9f0 (patch) | |
tree | 0afab28faa50e5f27698f8dd6c1921fff8d25e39 /contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp | |
parent | b14637d118e110006a149a79b649c5695e7f419a (diff) | |
parent | 1d5ae1026e831016fc29fd927877c86af904481f (diff) |
Notes
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp | 640 |
1 files changed, 530 insertions, 110 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp index 0bc2bcff2ae1..9f93c68e6128 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -66,7 +66,7 @@ using namespace llvm; #define DEBUG_TYPE "loop-fusion" -STATISTIC(FuseCounter, "Count number of loop fusions performed"); +STATISTIC(FuseCounter, "Loops fused"); STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion"); STATISTIC(InvalidPreheader, "Loop has invalid preheader"); STATISTIC(InvalidHeader, "Loop has invalid header"); @@ -79,12 +79,15 @@ STATISTIC(MayThrowException, "Loop may throw an exception"); STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access"); STATISTIC(NotSimplifiedForm, "Loop is not in simplified form"); STATISTIC(InvalidDependencies, "Dependencies prevent fusion"); -STATISTIC(InvalidTripCount, - "Loop does not have invariant backedge taken count"); +STATISTIC(UnknownTripCount, "Loop has unknown trip count"); STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop"); -STATISTIC(NonEqualTripCount, "Candidate trip counts are not the same"); -STATISTIC(NonAdjacent, "Candidates are not adjacent"); -STATISTIC(NonEmptyPreheader, "Candidate has a non-empty preheader"); +STATISTIC(NonEqualTripCount, "Loop trip counts are not the same"); +STATISTIC(NonAdjacent, "Loops are not adjacent"); +STATISTIC(NonEmptyPreheader, "Loop has a non-empty preheader"); +STATISTIC(FusionNotBeneficial, "Fusion is not beneficial"); +STATISTIC(NonIdenticalGuards, "Candidates have different guards"); +STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block"); +STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block"); enum FusionDependenceAnalysisChoice { FUSION_DEPENDENCE_ANALYSIS_SCEV, @@ -110,6 +113,7 @@ static cl::opt<bool> cl::Hidden, cl::init(false), cl::ZeroOrMore); #endif +namespace { /// This class is used to represent a candidate for loop fusion. When it is /// constructed, it checks the conditions for loop fusion to ensure that it /// represents a valid candidate. It caches several parts of a loop that are @@ -143,6 +147,8 @@ struct FusionCandidate { SmallVector<Instruction *, 16> MemWrites; /// Are all of the members of this fusion candidate still valid bool Valid; + /// Guard branch of the loop, if it exists + BranchInst *GuardBranch; /// Dominator and PostDominator trees are needed for the /// FusionCandidateCompare function, required by FusionCandidateSet to @@ -151,11 +157,20 @@ struct FusionCandidate { const DominatorTree *DT; const PostDominatorTree *PDT; + OptimizationRemarkEmitter &ORE; + FusionCandidate(Loop *L, const DominatorTree *DT, - const PostDominatorTree *PDT) + const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE) : Preheader(L->getLoopPreheader()), Header(L->getHeader()), ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()), - Latch(L->getLoopLatch()), L(L), Valid(true), DT(DT), PDT(PDT) { + Latch(L->getLoopLatch()), L(L), Valid(true), GuardBranch(nullptr), + DT(DT), PDT(PDT), ORE(ORE) { + + // TODO: This is temporary while we fuse both rotated and non-rotated + // loops. Once we switch to only fusing rotated loops, the initialization of + // GuardBranch can be moved into the initialization list above. + if (isRotated()) + GuardBranch = L->getLoopGuardBranch(); // Walk over all blocks in the loop and check for conditions that may // prevent fusion. For each block, walk over all instructions and collect @@ -163,28 +178,28 @@ struct FusionCandidate { // found, invalidate this object and return. for (BasicBlock *BB : L->blocks()) { if (BB->hasAddressTaken()) { - AddressTakenBB++; invalidate(); + reportInvalidCandidate(AddressTakenBB); return; } for (Instruction &I : *BB) { if (I.mayThrow()) { - MayThrowException++; invalidate(); + reportInvalidCandidate(MayThrowException); return; } if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { if (SI->isVolatile()) { - ContainsVolatileAccess++; invalidate(); + reportInvalidCandidate(ContainsVolatileAccess); return; } } if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { if (LI->isVolatile()) { - ContainsVolatileAccess++; invalidate(); + reportInvalidCandidate(ContainsVolatileAccess); return; } } @@ -214,19 +229,96 @@ struct FusionCandidate { assert(Latch == L->getLoopLatch() && "Latch is out of sync"); } + /// Get the entry block for this fusion candidate. + /// + /// If this fusion candidate represents a guarded loop, the entry block is the + /// loop guard block. If it represents an unguarded loop, the entry block is + /// the preheader of the loop. + BasicBlock *getEntryBlock() const { + if (GuardBranch) + return GuardBranch->getParent(); + else + return Preheader; + } + + /// Given a guarded loop, get the successor of the guard that is not in the + /// loop. + /// + /// This method returns the successor of the loop guard that is not located + /// within the loop (i.e., the successor of the guard that is not the + /// preheader). + /// This method is only valid for guarded loops. + BasicBlock *getNonLoopBlock() const { + assert(GuardBranch && "Only valid on guarded loops."); + assert(GuardBranch->isConditional() && + "Expecting guard to be a conditional branch."); + return (GuardBranch->getSuccessor(0) == Preheader) + ? GuardBranch->getSuccessor(1) + : GuardBranch->getSuccessor(0); + } + + bool isRotated() const { + assert(L && "Expecting loop to be valid."); + assert(Latch && "Expecting latch to be valid."); + return L->isLoopExiting(Latch); + } + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void dump() const { - dbgs() << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr") + dbgs() << "\tGuardBranch: " + << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n" + << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr") << "\n" << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n" << "\tExitingBB: " << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n" << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr") << "\n" - << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n"; + << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n" + << "\tEntryBlock: " + << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr") + << "\n"; } #endif + /// Determine if a fusion candidate (representing a loop) is eligible for + /// fusion. Note that this only checks whether a single loop can be fused - it + /// does not check whether it is *legal* to fuse two loops together. + bool isEligibleForFusion(ScalarEvolution &SE) const { + if (!isValid()) { + LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n"); + if (!Preheader) + ++InvalidPreheader; + if (!Header) + ++InvalidHeader; + if (!ExitingBlock) + ++InvalidExitingBlock; + if (!ExitBlock) + ++InvalidExitBlock; + if (!Latch) + ++InvalidLatch; + if (L->isInvalid()) + ++InvalidLoop; + + return false; + } + + // Require ScalarEvolution to be able to determine a trip count. + if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { + LLVM_DEBUG(dbgs() << "Loop " << L->getName() + << " trip count not computable!\n"); + return reportInvalidCandidate(UnknownTripCount); + } + + if (!L->isLoopSimplifyForm()) { + LLVM_DEBUG(dbgs() << "Loop " << L->getName() + << " is not in simplified form!\n"); + return reportInvalidCandidate(NotSimplifiedForm); + } + + return true; + } + private: // This is only used internally for now, to clear the MemWrites and MemReads // list and setting Valid to false. I can't envision other uses of this right @@ -239,17 +331,18 @@ private: MemReads.clear(); Valid = false; } -}; -inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, - const FusionCandidate &FC) { - if (FC.isValid()) - OS << FC.Preheader->getName(); - else - OS << "<Invalid>"; - - return OS; -} + bool reportInvalidCandidate(llvm::Statistic &Stat) const { + using namespace ore; + assert(L && Preheader && "Fusion candidate not initialized properly!"); + ++Stat; + ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(), + L->getStartLoc(), Preheader) + << "[" << Preheader->getParent()->getName() << "]: " + << "Loop is not a candidate for fusion: " << Stat.getDesc()); + return false; + } +}; struct FusionCandidateCompare { /// Comparison functor to sort two Control Flow Equivalent fusion candidates @@ -260,21 +353,24 @@ struct FusionCandidateCompare { const FusionCandidate &RHS) const { const DominatorTree *DT = LHS.DT; + BasicBlock *LHSEntryBlock = LHS.getEntryBlock(); + BasicBlock *RHSEntryBlock = RHS.getEntryBlock(); + // Do not save PDT to local variable as it is only used in asserts and thus // will trigger an unused variable warning if building without asserts. assert(DT && LHS.PDT && "Expecting valid dominator tree"); // Do this compare first so if LHS == RHS, function returns false. - if (DT->dominates(RHS.Preheader, LHS.Preheader)) { + if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) { // RHS dominates LHS // Verify LHS post-dominates RHS - assert(LHS.PDT->dominates(LHS.Preheader, RHS.Preheader)); + assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock)); return false; } - if (DT->dominates(LHS.Preheader, RHS.Preheader)) { + if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) { // Verify RHS Postdominates LHS - assert(LHS.PDT->dominates(RHS.Preheader, LHS.Preheader)); + assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock)); return true; } @@ -286,7 +382,6 @@ struct FusionCandidateCompare { } }; -namespace { using LoopVector = SmallVector<Loop *, 4>; // Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance @@ -301,17 +396,26 @@ using LoopVector = SmallVector<Loop *, 4>; // keeps the FusionCandidateSet sorted will also simplify the implementation. using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>; using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>; -} // namespace -inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, +#if !defined(NDEBUG) +static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, + const FusionCandidate &FC) { + if (FC.isValid()) + OS << FC.Preheader->getName(); + else + OS << "<Invalid>"; + + return OS; +} + +static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FusionCandidateSet &CandSet) { - for (auto IT : CandSet) - OS << IT << "\n"; + for (const FusionCandidate &FC : CandSet) + OS << FC << '\n'; return OS; } -#if !defined(NDEBUG) static void printFusionCandidates(const FusionCandidateCollection &FusionCandidates) { dbgs() << "Fusion Candidates: \n"; @@ -391,16 +495,6 @@ static void printLoopVector(const LoopVector &LV) { } #endif -static void reportLoopFusion(const FusionCandidate &FC0, - const FusionCandidate &FC1, - OptimizationRemarkEmitter &ORE) { - using namespace ore; - ORE.emit( - OptimizationRemark(DEBUG_TYPE, "LoopFusion", FC0.Preheader->getParent()) - << "Fused " << NV("Cand1", StringRef(FC0.Preheader->getName())) - << " with " << NV("Cand2", StringRef(FC1.Preheader->getName()))); -} - struct LoopFuser { private: // Sets of control flow equivalent fusion candidates for a given nest level. @@ -497,53 +591,16 @@ private: const FusionCandidate &FC1) const { assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders"); - if (DT.dominates(FC0.Preheader, FC1.Preheader)) - return PDT.dominates(FC1.Preheader, FC0.Preheader); + BasicBlock *FC0EntryBlock = FC0.getEntryBlock(); + BasicBlock *FC1EntryBlock = FC1.getEntryBlock(); - if (DT.dominates(FC1.Preheader, FC0.Preheader)) - return PDT.dominates(FC0.Preheader, FC1.Preheader); + if (DT.dominates(FC0EntryBlock, FC1EntryBlock)) + return PDT.dominates(FC1EntryBlock, FC0EntryBlock); - return false; - } - - /// Determine if a fusion candidate (representing a loop) is eligible for - /// fusion. Note that this only checks whether a single loop can be fused - it - /// does not check whether it is *legal* to fuse two loops together. - bool eligibleForFusion(const FusionCandidate &FC) const { - if (!FC.isValid()) { - LLVM_DEBUG(dbgs() << "FC " << FC << " has invalid CFG requirements!\n"); - if (!FC.Preheader) - InvalidPreheader++; - if (!FC.Header) - InvalidHeader++; - if (!FC.ExitingBlock) - InvalidExitingBlock++; - if (!FC.ExitBlock) - InvalidExitBlock++; - if (!FC.Latch) - InvalidLatch++; - if (FC.L->isInvalid()) - InvalidLoop++; + if (DT.dominates(FC1EntryBlock, FC0EntryBlock)) + return PDT.dominates(FC0EntryBlock, FC1EntryBlock); - return false; - } - - // Require ScalarEvolution to be able to determine a trip count. - if (!SE.hasLoopInvariantBackedgeTakenCount(FC.L)) { - LLVM_DEBUG(dbgs() << "Loop " << FC.L->getName() - << " trip count not computable!\n"); - InvalidTripCount++; - return false; - } - - if (!FC.L->isLoopSimplifyForm()) { - LLVM_DEBUG(dbgs() << "Loop " << FC.L->getName() - << " is not in simplified form!\n"); - NotSimplifiedForm++; - return false; - } - - return true; + return false; } /// Iterate over all loops in the given loop set and identify the loops that @@ -551,8 +608,8 @@ private: /// Flow Equivalent sets, sorted by dominance. void collectFusionCandidates(const LoopVector &LV) { for (Loop *L : LV) { - FusionCandidate CurrCand(L, &DT, &PDT); - if (!eligibleForFusion(CurrCand)) + FusionCandidate CurrCand(L, &DT, &PDT, ORE); + if (!CurrCand.isEligibleForFusion(SE)) continue; // Go through each list in FusionCandidates and determine if L is control @@ -664,31 +721,64 @@ private: if (!identicalTripCounts(*FC0, *FC1)) { LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip " "counts. Not fusing.\n"); - NonEqualTripCount++; + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonEqualTripCount); continue; } if (!isAdjacent(*FC0, *FC1)) { LLVM_DEBUG(dbgs() << "Fusion candidates are not adjacent. Not fusing.\n"); - NonAdjacent++; + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent); continue; } - // For now we skip fusing if the second candidate has any instructions - // in the preheader. This is done because we currently do not have the - // safety checks to determine if it is save to move the preheader of - // the second candidate past the body of the first candidate. Once - // these checks are added, this condition can be removed. + // Ensure that FC0 and FC1 have identical guards. + // If one (or both) are not guarded, this check is not necessary. + if (FC0->GuardBranch && FC1->GuardBranch && + !haveIdenticalGuards(*FC0, *FC1)) { + LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical " + "guards. Not Fusing.\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonIdenticalGuards); + continue; + } + + // The following three checks look for empty blocks in FC0 and FC1. If + // any of these blocks are non-empty, we do not fuse. This is done + // because we currently do not have the safety checks to determine if + // it is safe to move the blocks past other blocks in the loop. Once + // these checks are added, these conditions can be relaxed. if (!isEmptyPreheader(*FC1)) { LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty " "preheader. Not fusing.\n"); - NonEmptyPreheader++; + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonEmptyPreheader); + continue; + } + + if (FC0->GuardBranch && !isEmptyExitBlock(*FC0)) { + LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty exit " + "block. Not fusing.\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonEmptyExitBlock); + continue; + } + + if (FC1->GuardBranch && !isEmptyGuardBlock(*FC1)) { + LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty guard " + "block. Not fusing.\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonEmptyGuardBlock); continue; } + // Check the dependencies across the loops and do not fuse if it would + // violate them. if (!dependencesAllowFusion(*FC0, *FC1)) { LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + InvalidDependencies); continue; } @@ -696,9 +786,11 @@ private: LLVM_DEBUG(dbgs() << "\tFusion appears to be " << (BeneficialToFuse ? "" : "un") << "profitable!\n"); - if (!BeneficialToFuse) + if (!BeneficialToFuse) { + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + FusionNotBeneficial); continue; - + } // All analysis has completed and has determined that fusion is legal // and profitable. At this point, start transforming the code and // perform fusion. @@ -710,15 +802,14 @@ private: // Note this needs to be done *before* performFusion because // performFusion will change the original loops, making it not // possible to identify them after fusion is complete. - reportLoopFusion(*FC0, *FC1, ORE); + reportLoopFusion<OptimizationRemark>(*FC0, *FC1, FuseCounter); - FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT); + FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT, ORE); FusedCand.verify(); - assert(eligibleForFusion(FusedCand) && + assert(FusedCand.isEligibleForFusion(SE) && "Fused candidate should be eligible for fusion!"); // Notify the loop-depth-tree that these loops are not valid objects - // anymore. LDT.removeLoop(FC1->L); CandidateSet.erase(FC0); @@ -889,7 +980,7 @@ private: LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1 << "\n"); assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth()); - assert(DT.dominates(FC0.Preheader, FC1.Preheader)); + assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock())); for (Instruction *WriteL0 : FC0.MemWrites) { for (Instruction *WriteL1 : FC1.MemWrites) @@ -939,18 +1030,89 @@ private: return true; } - /// Determine if the exit block of \p FC0 is the preheader of \p FC1. In this - /// case, there is no code in between the two fusion candidates, thus making - /// them adjacent. + /// Determine if two fusion candidates are adjacent in the CFG. + /// + /// This method will determine if there are additional basic blocks in the CFG + /// between the exit of \p FC0 and the entry of \p FC1. + /// If the two candidates are guarded loops, then it checks whether the + /// non-loop successor of the \p FC0 guard branch is the entry block of \p + /// FC1. If not, then the loops are not adjacent. If the two candidates are + /// not guarded loops, then it checks whether the exit block of \p FC0 is the + /// preheader of \p FC1. bool isAdjacent(const FusionCandidate &FC0, const FusionCandidate &FC1) const { - return FC0.ExitBlock == FC1.Preheader; + // If the successor of the guard branch is FC1, then the loops are adjacent + if (FC0.GuardBranch) + return FC0.getNonLoopBlock() == FC1.getEntryBlock(); + else + return FC0.ExitBlock == FC1.getEntryBlock(); + } + + /// Determine if two fusion candidates have identical guards + /// + /// This method will determine if two fusion candidates have the same guards. + /// The guards are considered the same if: + /// 1. The instructions to compute the condition used in the compare are + /// identical. + /// 2. The successors of the guard have the same flow into/around the loop. + /// If the compare instructions are identical, then the first successor of the + /// guard must go to the same place (either the preheader of the loop or the + /// NonLoopBlock). In other words, the the first successor of both loops must + /// both go into the loop (i.e., the preheader) or go around the loop (i.e., + /// the NonLoopBlock). The same must be true for the second successor. + bool haveIdenticalGuards(const FusionCandidate &FC0, + const FusionCandidate &FC1) const { + assert(FC0.GuardBranch && FC1.GuardBranch && + "Expecting FC0 and FC1 to be guarded loops."); + + if (auto FC0CmpInst = + dyn_cast<Instruction>(FC0.GuardBranch->getCondition())) + if (auto FC1CmpInst = + dyn_cast<Instruction>(FC1.GuardBranch->getCondition())) + if (!FC0CmpInst->isIdenticalTo(FC1CmpInst)) + return false; + + // The compare instructions are identical. + // Now make sure the successor of the guards have the same flow into/around + // the loop + if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader) + return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader); + else + return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader); + } + + /// Check that the guard for \p FC *only* contains the cmp/branch for the + /// guard. + /// Once we are able to handle intervening code, any code in the guard block + /// for FC1 will need to be treated as intervening code and checked whether + /// it can safely move around the loops. + bool isEmptyGuardBlock(const FusionCandidate &FC) const { + assert(FC.GuardBranch && "Expecting a fusion candidate with guard branch."); + if (auto *CmpInst = dyn_cast<Instruction>(FC.GuardBranch->getCondition())) { + auto *GuardBlock = FC.GuardBranch->getParent(); + // If the generation of the cmp value is in GuardBlock, then the size of + // the guard block should be 2 (cmp + branch). If the generation of the + // cmp value is in a different block, then the size of the guard block + // should only be 1. + if (CmpInst->getParent() == GuardBlock) + return GuardBlock->size() == 2; + else + return GuardBlock->size() == 1; + } + + return false; } bool isEmptyPreheader(const FusionCandidate &FC) const { + assert(FC.Preheader && "Expecting a valid preheader"); return FC.Preheader->size() == 1; } + bool isEmptyExitBlock(const FusionCandidate &FC) const { + assert(FC.ExitBlock && "Expecting a valid exit block"); + return FC.ExitBlock->size() == 1; + } + /// Fuse two fusion candidates, creating a new fused loop. /// /// This method contains the mechanics of fusing two loops, represented by \p @@ -987,6 +1149,12 @@ private: LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump(); dbgs() << "Fusion Candidate 1: \n"; FC1.dump();); + // Fusing guarded loops is handled slightly differently than non-guarded + // loops and has been broken out into a separate method instead of trying to + // intersperse the logic within a single method. + if (FC0.GuardBranch) + return fuseGuardedLoops(FC0, FC1); + assert(FC1.Preheader == FC0.ExitBlock); assert(FC1.Preheader->size() == 1 && FC1.Preheader->getSingleSuccessor() == FC1.Header); @@ -1131,7 +1299,258 @@ private: SE.verify(); #endif - FuseCounter++; + LLVM_DEBUG(dbgs() << "Fusion done:\n"); + + return FC0.L; + } + + /// Report details on loop fusion opportunities. + /// + /// This template function can be used to report both successful and missed + /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should + /// be one of: + /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful + /// given two valid fusion candidates. + /// - OptimizationRemark to report successful fusion of two fusion + /// candidates. + /// The remarks will be printed using the form: + /// <path/filename>:<line number>:<column number>: [<function name>]: + /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description> + template <typename RemarkKind> + void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1, + llvm::Statistic &Stat) { + assert(FC0.Preheader && FC1.Preheader && + "Expecting valid fusion candidates"); + using namespace ore; + ++Stat; + ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(), + FC0.Preheader) + << "[" << FC0.Preheader->getParent()->getName() + << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName())) + << " and " << NV("Cand2", StringRef(FC1.Preheader->getName())) + << ": " << Stat.getDesc()); + } + + /// Fuse two guarded fusion candidates, creating a new fused loop. + /// + /// Fusing guarded loops is handled much the same way as fusing non-guarded + /// loops. The rewiring of the CFG is slightly different though, because of + /// the presence of the guards around the loops and the exit blocks after the + /// loop body. As such, the new loop is rewired as follows: + /// 1. Keep the guard branch from FC0 and use the non-loop block target + /// from the FC1 guard branch. + /// 2. Remove the exit block from FC0 (this exit block should be empty + /// right now). + /// 3. Remove the guard branch for FC1 + /// 4. Remove the preheader for FC1. + /// The exit block successor for the latch of FC0 is updated to be the header + /// of FC1 and the non-exit block successor of the latch of FC1 is updated to + /// be the header of FC0, thus creating the fused loop. + Loop *fuseGuardedLoops(const FusionCandidate &FC0, + const FusionCandidate &FC1) { + assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops"); + + BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent(); + BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent(); + BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock(); + BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock(); + + assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent"); + + SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; + + //////////////////////////////////////////////////////////////////////////// + // Update the Loop Guard + //////////////////////////////////////////////////////////////////////////// + // The guard for FC0 is updated to guard both FC0 and FC1. This is done by + // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1. + // Thus, one path from the guard goes to the preheader for FC0 (and thus + // executes the new fused loop) and the other path goes to the NonLoopBlock + // for FC1 (where FC1 guard would have gone if FC1 was not executed). + FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock); + FC0.ExitBlock->getTerminator()->replaceUsesOfWith(FC1GuardBlock, + FC1.Header); + + // The guard of FC1 is not necessary anymore. + FC1.GuardBranch->eraseFromParent(); + new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock); + + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC1GuardBlock, FC1.Preheader)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock)); + + assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) && + "Expecting guard block to have no predecessors"); + assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) && + "Expecting guard block to have no successors"); + + // Remember the phi nodes originally in the header of FC0 in order to rewire + // them later. However, this is only necessary if the new loop carried + // values might not dominate the exiting branch. While we do not generally + // test if this is the case but simply insert intermediate phi nodes, we + // need to make sure these intermediate phi nodes have different + // predecessors. To this end, we filter the special case where the exiting + // block is the latch block of the first loop. Nothing needs to be done + // anyway as all loop carried values dominate the latch and thereby also the + // exiting branch. + // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch + // (because the loops are rotated. Thus, nothing will ever be added to + // OriginalFC0PHIs. + SmallVector<PHINode *, 8> OriginalFC0PHIs; + if (FC0.ExitingBlock != FC0.Latch) + for (PHINode &PHI : FC0.Header->phis()) + OriginalFC0PHIs.push_back(&PHI); + + assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!"); + + // Replace incoming blocks for header PHIs first. + FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader); + FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch); + + // The old exiting block of the first loop (FC0) has to jump to the header + // of the second as we need to execute the code in the second header block + // regardless of the trip count. That is, if the trip count is 0, so the + // back edge is never taken, we still have to execute both loop headers, + // especially (but not only!) if the second is a do-while style loop. + // However, doing so might invalidate the phi nodes of the first loop as + // the new values do only need to dominate their latch and not the exiting + // predicate. To remedy this potential problem we always introduce phi + // nodes in the header of the second loop later that select the loop carried + // value, if the second header was reached through an old latch of the + // first, or undef otherwise. This is sound as exiting the first implies the + // second will exit too, __without__ taking the back-edge (their + // trip-counts are equal after all). + FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock, + FC1.Header); + + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); + + // Remove FC0 Exit Block + // The exit block for FC0 is no longer needed since control will flow + // directly to the header of FC1. Since it is an empty block, it can be + // removed at this point. + // TODO: In the future, we can handle non-empty exit blocks my merging any + // instructions from FC0 exit block into FC1 exit block prior to removing + // the block. + assert(pred_begin(FC0.ExitBlock) == pred_end(FC0.ExitBlock) && + "Expecting exit block to be empty"); + FC0.ExitBlock->getTerminator()->eraseFromParent(); + new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); + + // Remove FC1 Preheader + // The pre-header of L1 is not necessary anymore. + assert(pred_begin(FC1.Preheader) == pred_end(FC1.Preheader)); + FC1.Preheader->getTerminator()->eraseFromParent(); + new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC1.Preheader, FC1.Header)); + + // Moves the phi nodes from the second to the first loops header block. + while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) { + if (SE.isSCEVable(PHI->getType())) + SE.forgetValue(PHI); + if (PHI->hasNUsesOrMore(1)) + PHI->moveBefore(&*FC0.Header->getFirstInsertionPt()); + else + PHI->eraseFromParent(); + } + + // Introduce new phi nodes in the second loop header to ensure + // exiting the first and jumping to the header of the second does not break + // the SSA property of the phis originally in the first loop. See also the + // comment above. + Instruction *L1HeaderIP = &FC1.Header->front(); + for (PHINode *LCPHI : OriginalFC0PHIs) { + int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch); + assert(L1LatchBBIdx >= 0 && + "Expected loop carried value to be rewired at this point!"); + + Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx); + + PHINode *L1HeaderPHI = PHINode::Create( + LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP); + L1HeaderPHI->addIncoming(LCV, FC0.Latch); + L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()), + FC0.ExitingBlock); + + LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI); + } + + // Update the latches + + // Replace latch terminator destinations. + FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); + FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); + + // If FC0.Latch and FC0.ExitingBlock are the same then we have already + // performed the updates above. + if (FC0.Latch != FC0.ExitingBlock) + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Insert, FC0.Latch, FC1.Header)); + + TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, + FC0.Latch, FC0.Header)); + TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert, + FC1.Latch, FC0.Header)); + TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, + FC1.Latch, FC1.Header)); + + // All done + // Apply the updates to the Dominator Tree and cleanup. + + assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) && + "FC1GuardBlock has successors!!"); + assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) && + "FC1GuardBlock has predecessors!!"); + + // Update DT/PDT + DTU.applyUpdates(TreeUpdates); + + LI.removeBlock(FC1.Preheader); + DTU.deleteBB(FC1.Preheader); + DTU.deleteBB(FC0.ExitBlock); + DTU.flush(); + + // Is there a way to keep SE up-to-date so we don't need to forget the loops + // and rebuild the information in subsequent passes of fusion? + SE.forgetLoop(FC1.L); + SE.forgetLoop(FC0.L); + + // Merge the loops. + SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(), + FC1.L->block_end()); + for (BasicBlock *BB : Blocks) { + FC0.L->addBlockEntry(BB); + FC1.L->removeBlockFromLoop(BB); + if (LI.getLoopFor(BB) != FC1.L) + continue; + LI.changeLoopFor(BB, FC0.L); + } + while (!FC1.L->empty()) { + const auto &ChildLoopIt = FC1.L->begin(); + Loop *ChildLoop = *ChildLoopIt; + FC1.L->removeChildLoop(ChildLoopIt); + FC0.L->addChildLoop(ChildLoop); + } + + // Delete the now empty loop L1. + LI.erase(FC1.L); + +#ifndef NDEBUG + assert(!verifyFunction(*FC0.Header->getParent(), &errs())); + assert(DT.verify(DominatorTree::VerificationLevel::Fast)); + assert(PDT.verify()); + LI.verify(DT); + SE.verify(); +#endif LLVM_DEBUG(dbgs() << "Fusion done:\n"); @@ -1177,6 +1596,7 @@ struct LoopFuseLegacy : public FunctionPass { return LF.fuseLoops(F); } }; +} // namespace PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) { auto &LI = AM.getResult<LoopAnalysis>(F); |