1 files changed, 303 insertions, 39 deletions
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index d94b767c7b63..0eecec373736 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -67,6 +67,7 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/CodeMoverUtils.h"
 #include "llvm/Transforms/Utils/LoopPeel.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
 
 using namespace llvm;
 
@@ -101,6 +102,8 @@ STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with "
 STATISTIC(NotRotated, "Candidate is not rotated");
 STATISTIC(OnlySecondCandidateIsGuarded,
           "The second candidate is guarded while the first one is not");
+STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions.");
+STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions.");
 
 enum FusionDependenceAnalysisChoice {
   FUSION_DEPENDENCE_ANALYSIS_SCEV,
@@ -183,9 +186,8 @@ struct FusionCandidate {
 
   OptimizationRemarkEmitter &ORE;
 
-  FusionCandidate(Loop *L, DominatorTree &DT,
-                  const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE,
-                  TTI::PeelingPreferences PP)
+  FusionCandidate(Loop *L, DominatorTree &DT, const PostDominatorTree *PDT,
+                  OptimizationRemarkEmitter &ORE, TTI::PeelingPreferences PP)
       : Preheader(L->getLoopPreheader()), Header(L->getHeader()),
         ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()),
         Latch(L->getLoopLatch()), L(L), Valid(true),
@@ -387,7 +389,13 @@ struct FusionCandidateCompare {
   /// Comparison functor to sort two Control Flow Equivalent fusion candidates
   /// into dominance order.
   /// If LHS dominates RHS and RHS post-dominates LHS, return true;
-  /// IF RHS dominates LHS and LHS post-dominates RHS, return false;
+  /// If RHS dominates LHS and LHS post-dominates RHS, return false;
+  /// If both LHS and RHS are not dominating each other then, non-strictly
+  /// post dominate check will decide the order of candidates. If RHS
+  /// non-strictly post dominates LHS then, return true. If LHS non-strictly
+  /// post dominates RHS then, return false. If both are non-strictly post
+  /// dominate each other then, level in the post dominator tree will decide
+  /// the order of candidates.
   bool operator()(const FusionCandidate &LHS,
                   const FusionCandidate &RHS) const {
     const DominatorTree *DT = &(LHS.DT);
@@ -413,9 +421,29 @@ struct FusionCandidateCompare {
       return true;
     }
 
-    // If LHS does not dominate RHS and RHS does not dominate LHS then there is
-    // no dominance relationship between the two FusionCandidates. Thus, they
-    // should not be in the same set together.
+    // If two FusionCandidates are in the same level of dominator tree,
+    // they will not dominate each other, but may still be control flow
+    // equivalent. To sort those FusionCandidates, nonStrictlyPostDominate()
+    // function is needed.
+    bool WrongOrder =
+        nonStrictlyPostDominate(LHSEntryBlock, RHSEntryBlock, DT, LHS.PDT);
+    bool RightOrder =
+        nonStrictlyPostDominate(RHSEntryBlock, LHSEntryBlock, DT, LHS.PDT);
+    if (WrongOrder && RightOrder) {
+      // If common predecessor of LHS and RHS post dominates both
+      // FusionCandidates then, Order of FusionCandidate can be
+      // identified by its level in post dominator tree.
+      DomTreeNode *LNode = LHS.PDT->getNode(LHSEntryBlock);
+      DomTreeNode *RNode = LHS.PDT->getNode(RHSEntryBlock);
+      return LNode->getLevel() > RNode->getLevel();
+    } else if (WrongOrder)
+      return false;
+    else if (RightOrder)
+      return true;
+
+    // If LHS does not non-strict Postdominate RHS and RHS does not non-strict
+    // Postdominate LHS then, there is no dominance relationship between the
+    // two FusionCandidates. Thus, they should not be in the same set together.
     llvm_unreachable(
         "No dominance relationship between these fusion candidates!");
   }
@@ -427,7 +455,7 @@ using LoopVector = SmallVector<Loop *, 4>;
 // order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0
 // dominates FC1 and FC1 post-dominates FC0.
 // std::set was chosen because we want a sorted data structure with stable
-// iterators. A subsequent patch to loop fusion will enable fusing non-ajdacent
+// iterators. A subsequent patch to loop fusion will enable fusing non-adjacent
 // loops by moving intervening code around. When this intervening code contains
 // loops, those loops will be moved also. The corresponding FusionCandidates
 // will also need to be moved accordingly. As this is done, having stable
@@ -528,7 +556,7 @@ private:
 #ifndef NDEBUG
 static void printLoopVector(const LoopVector &LV) {
   dbgs() << "****************************\n";
-  for (auto L : LV)
+  for (auto *L : LV)
     printLoop(*L, dbgs());
   dbgs() << "****************************\n";
 }
@@ -549,7 +577,6 @@ private:
   PostDominatorTree &PDT;
   OptimizationRemarkEmitter &ORE;
   AssumptionCache &AC;
-
   const TargetTransformInfo &TTI;
 
 public:
@@ -644,7 +671,7 @@ private:
   void collectFusionCandidates(const LoopVector &LV) {
     for (Loop *L : LV) {
       TTI::PeelingPreferences PP =
-          gatherPeelingPreferences(L, SE, TTI, None, None);
+          gatherPeelingPreferences(L, SE, TTI, std::nullopt, std::nullopt);
       FusionCandidate CurrCand(L, DT, &PDT, ORE, PP);
       if (!CurrCand.isEligibleForFusion(SE))
         continue;
@@ -699,23 +726,22 @@ private:
   /// stating whether or not the two candidates are known at compile time to
   /// have the same TripCount. The second is the difference in the two
   /// TripCounts. This information can be used later to determine whether or not
-  /// peeling can be performed on either one of the candiates.
-  std::pair<bool, Optional<unsigned>>
+  /// peeling can be performed on either one of the candidates.
+  std::pair<bool, std::optional<unsigned>>
   haveIdenticalTripCounts(const FusionCandidate &FC0,
                           const FusionCandidate &FC1) const {
-
     const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L);
     if (isa<SCEVCouldNotCompute>(TripCount0)) {
       UncomputableTripCount++;
       LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!");
-      return {false, None};
+      return {false, std::nullopt};
     }
 
     const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L);
     if (isa<SCEVCouldNotCompute>(TripCount1)) {
       UncomputableTripCount++;
       LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!");
-      return {false, None};
+      return {false, std::nullopt};
     }
 
     LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & "
@@ -740,10 +766,10 @@ private:
       LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not "
                            "have a constant number of iterations. Peeling "
                            "is not benefical\n");
-      return {false, None};
+      return {false, std::nullopt};
     }
 
-    Optional<unsigned> Difference = None;
+    std::optional<unsigned> Difference;
     int Diff = TC0 - TC1;
 
     if (Diff > 0)
@@ -767,7 +793,8 @@ private:
     LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount
                       << " iterations of the first loop. \n");
 
-    FC0.Peeled = peelLoop(FC0.L, PeelCount, &LI, &SE, DT, &AC, true);
+    ValueToValueMapTy VMap;
+    FC0.Peeled = peelLoop(FC0.L, PeelCount, &LI, &SE, DT, &AC, true, VMap);
     if (FC0.Peeled) {
       LLVM_DEBUG(dbgs() << "Done Peeling\n");
 
@@ -807,7 +834,7 @@ private:
         }
         // Cannot modify the predecessors inside the above loop as it will cause
         // the iterators to be nullptrs, causing memory errors.
-        for (Instruction *CurrentBranch: WorkList) {
+        for (Instruction *CurrentBranch : WorkList) {
           BasicBlock *Succ = CurrentBranch->getSuccessor(0);
           if (Succ == BB)
             Succ = CurrentBranch->getSuccessor(1);
@@ -858,12 +885,12 @@ private:
           // Check if the candidates have identical tripcounts (first value of
           // pair), and if not check the difference in the tripcounts between
           // the loops (second value of pair). The difference is not equal to
-          // None iff the loops iterate a constant number of times, and have a
-          // single exit.
-          std::pair<bool, Optional<unsigned>> IdenticalTripCountRes =
+          // std::nullopt iff the loops iterate a constant number of times, and
+          // have a single exit.
+          std::pair<bool, std::optional<unsigned>> IdenticalTripCountRes =
               haveIdenticalTripCounts(*FC0, *FC1);
           bool SameTripCount = IdenticalTripCountRes.first;
-          Optional<unsigned> TCDifference = IdenticalTripCountRes.second;
+          std::optional<unsigned> TCDifference = IdenticalTripCountRes.second;
 
           // Here we are checking that FC0 (the first loop) can be peeled, and
           // both loops have different tripcounts.
@@ -895,9 +922,10 @@ private:
             continue;
           }
 
-          if (!FC0->GuardBranch && FC1->GuardBranch) {
-            LLVM_DEBUG(dbgs() << "The second candidate is guarded while the "
-                                 "first one is not. Not fusing.\n");
+          if ((!FC0->GuardBranch && FC1->GuardBranch) ||
+              (FC0->GuardBranch && !FC1->GuardBranch)) {
+            LLVM_DEBUG(dbgs() << "The one of candidate is guarded while the "
+                                 "another one is not. Not fusing.\n");
             reportLoopFusion<OptimizationRemarkMissed>(
                 *FC0, *FC1, OnlySecondCandidateIsGuarded);
             continue;
@@ -914,16 +942,6 @@ private:
             continue;
           }
 
-          if (!isSafeToMoveBefore(*FC1->Preheader,
-                                  *FC0->Preheader->getTerminator(), DT, &PDT,
-                                  &DI)) {
-            LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "
-                                 "instructions in preheader. Not fusing.\n");
-            reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
-                                                       NonEmptyPreheader);
-            continue;
-          }
-
           if (FC0->GuardBranch) {
             assert(FC1->GuardBranch && "Expecting valid FC1 guard branch");
 
@@ -959,6 +977,31 @@ private:
             continue;
           }
 
+          // If the second loop has instructions in the pre-header, attempt to
+          // hoist them up to the first loop's pre-header or sink them into the
+          // body of the second loop.
+          SmallVector<Instruction *, 4> SafeToHoist;
+          SmallVector<Instruction *, 4> SafeToSink;
+          // At this point, this is the last remaining legality check.
+          // Which means if we can make this pre-header empty, we can fuse
+          // these loops
+          if (!isEmptyPreheader(*FC1)) {
+            LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty "
+                                 "preheader.\n");
+
+            // If it is not safe to hoist/sink all instructions in the
+            // pre-header, we cannot fuse these loops.
+            if (!collectMovablePreheaderInsts(*FC0, *FC1, SafeToHoist,
+                                              SafeToSink)) {
+              LLVM_DEBUG(dbgs() << "Could not hoist/sink all instructions in "
+                                   "Fusion Candidate Pre-header.\n"
+                                << "Not Fusing.\n");
+              reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+                                                         NonEmptyPreheader);
+              continue;
+            }
+          }
+
           bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1);
           LLVM_DEBUG(dbgs()
                      << "\tFusion appears to be "
@@ -972,6 +1015,9 @@ private:
           // and profitable. At this point, start transforming the code and
           // perform fusion.
 
+          // Execute the hoist/sink operations on preheader instructions
+          movePreheaderInsts(*FC0, *FC1, SafeToHoist, SafeToSink);
+
           LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and "
                             << *FC1 << "\n");
 
@@ -1022,6 +1068,170 @@ private:
     return Fused;
   }
 
+  // Returns true if the instruction \p I can be hoisted to the end of the
+  // preheader of \p FC0. \p SafeToHoist contains the instructions that are
+  // known to be safe to hoist. The instructions encountered that cannot be
+  // hoisted are in \p NotHoisting.
+  // TODO: Move functionality into CodeMoverUtils
+  bool canHoistInst(Instruction &I,
+                    const SmallVector<Instruction *, 4> &SafeToHoist,
+                    const SmallVector<Instruction *, 4> &NotHoisting,
+                    const FusionCandidate &FC0) const {
+    const BasicBlock *FC0PreheaderTarget = FC0.Preheader->getSingleSuccessor();
+    assert(FC0PreheaderTarget &&
+           "Expected single successor for loop preheader.");
+
+    for (Use &Op : I.operands()) {
+      if (auto *OpInst = dyn_cast<Instruction>(Op)) {
+        bool OpHoisted = is_contained(SafeToHoist, OpInst);
+        // Check if we have already decided to hoist this operand. In this
+        // case, it does not dominate FC0 *yet*, but will after we hoist it.
+        if (!(OpHoisted || DT.dominates(OpInst, FC0PreheaderTarget))) {
+          return false;
+        }
+      }
+    }
+
+    // PHIs in FC1's header only have FC0 blocks as predecessors. PHIs
+    // cannot be hoisted and should be sunk to the exit of the fused loop.
+    if (isa<PHINode>(I))
+      return false;
+
+    // If this isn't a memory inst, hoisting is safe
+    if (!I.mayReadOrWriteMemory())
+      return true;
+
+    LLVM_DEBUG(dbgs() << "Checking if this mem inst can be hoisted.\n");
+    for (Instruction *NotHoistedInst : NotHoisting) {
+      if (auto D = DI.depends(&I, NotHoistedInst, true)) {
+        // Dependency is not read-before-write, write-before-read or
+        // write-before-write
+        if (D->isFlow() || D->isAnti() || D->isOutput()) {
+          LLVM_DEBUG(dbgs() << "Inst depends on an instruction in FC1's "
+                               "preheader that is not being hoisted.\n");
+          return false;
+        }
+      }
+    }
+
+    for (Instruction *ReadInst : FC0.MemReads) {
+      if (auto D = DI.depends(ReadInst, &I, true)) {
+        // Dependency is not read-before-write
+        if (D->isAnti()) {
+          LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC0.\n");
+          return false;
+        }
+      }
+    }
+
+    for (Instruction *WriteInst : FC0.MemWrites) {
+      if (auto D = DI.depends(WriteInst, &I, true)) {
+        // Dependency is not write-before-read or write-before-write
+        if (D->isFlow() || D->isOutput()) {
+          LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC0.\n");
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  // Returns true if the instruction \p I can be sunk to the top of the exit
+  // block of \p FC1.
+  // TODO: Move functionality into CodeMoverUtils
+  bool canSinkInst(Instruction &I, const FusionCandidate &FC1) const {
+    for (User *U : I.users()) {
+      if (auto *UI{dyn_cast<Instruction>(U)}) {
+        // Cannot sink if user in loop
+        // If FC1 has phi users of this value, we cannot sink it into FC1.
+        if (FC1.L->contains(UI)) {
+          // Cannot hoist or sink this instruction. No hoisting/sinking
+          // should take place, loops should not fuse
+          return false;
+        }
+      }
+    }
+
+    // If this isn't a memory inst, sinking is safe
+    if (!I.mayReadOrWriteMemory())
+      return true;
+
+    for (Instruction *ReadInst : FC1.MemReads) {
+      if (auto D = DI.depends(&I, ReadInst, true)) {
+        // Dependency is not write-before-read
+        if (D->isFlow()) {
+          LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC1.\n");
+          return false;
+        }
+      }
+    }
+
+    for (Instruction *WriteInst : FC1.MemWrites) {
+      if (auto D = DI.depends(&I, WriteInst, true)) {
+        // Dependency is not write-before-write or read-before-write
+        if (D->isOutput() || D->isAnti()) {
+          LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC1.\n");
+          return false;
+        }
+      }
+    }
+
+    return true;
+  }
+
+  /// Collect instructions in the \p FC1 Preheader that can be hoisted
+  /// to the \p FC0 Preheader or sunk into the \p FC1 Body
+  bool collectMovablePreheaderInsts(
+      const FusionCandidate &FC0, const FusionCandidate &FC1,
+      SmallVector<Instruction *, 4> &SafeToHoist,
+      SmallVector<Instruction *, 4> &SafeToSink) const {
+    BasicBlock *FC1Preheader = FC1.Preheader;
+    // Save the instructions that are not being hoisted, so we know not to hoist
+    // mem insts that they dominate.
+    SmallVector<Instruction *, 4> NotHoisting;
+
+    for (Instruction &I : *FC1Preheader) {
+      // Can't move a branch
+      if (&I == FC1Preheader->getTerminator())
+        continue;
+      // If the instruction has side-effects, give up.
+      // TODO: The case of mayReadFromMemory we can handle but requires
+      // additional work with a dependence analysis so for now we give
+      // up on memory reads.
+      if (I.mayThrow() || !I.willReturn()) {
+        LLVM_DEBUG(dbgs() << "Inst: " << I << " may throw or won't return.\n");
+        return false;
+      }
+
+      LLVM_DEBUG(dbgs() << "Checking Inst: " << I << "\n");
+
+      if (I.isAtomic() || I.isVolatile()) {
+        LLVM_DEBUG(
+            dbgs() << "\tInstruction is volatile or atomic. Cannot move it.\n");
+        return false;
+      }
+
+      if (canHoistInst(I, SafeToHoist, NotHoisting, FC0)) {
+        SafeToHoist.push_back(&I);
+        LLVM_DEBUG(dbgs() << "\tSafe to hoist.\n");
+      } else {
+        LLVM_DEBUG(dbgs() << "\tCould not hoist. Trying to sink...\n");
+        NotHoisting.push_back(&I);
+
+        if (canSinkInst(I, FC1)) {
+          SafeToSink.push_back(&I);
+          LLVM_DEBUG(dbgs() << "\tSafe to sink.\n");
+        } else {
+          LLVM_DEBUG(dbgs() << "\tCould not sink.\n");
+          return false;
+        }
+      }
+    }
+    LLVM_DEBUG(
+        dbgs() << "All preheader instructions could be sunk or hoisted!\n");
+    return true;
+  }
+
   /// Rewrite all additive recurrences in a SCEV to use a new loop.
   class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> {
   public:
@@ -1034,7 +1244,7 @@ private:
       const Loop *ExprL = Expr->getLoop();
       SmallVector<const SCEV *, 2> Operands;
       if (ExprL == &OldL) {
-        Operands.append(Expr->op_begin(), Expr->op_end());
+        append_range(Operands, Expr->operands());
         return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags());
       }
 
@@ -1235,6 +1445,46 @@ private:
       return FC0.ExitBlock == FC1.getEntryBlock();
   }
 
+  bool isEmptyPreheader(const FusionCandidate &FC) const {
+    return FC.Preheader->size() == 1;
+  }
+
+  /// Hoist \p FC1 Preheader instructions to \p FC0 Preheader
+  /// and sink others into the body of \p FC1.
+  void movePreheaderInsts(const FusionCandidate &FC0,
+                          const FusionCandidate &FC1,
+                          SmallVector<Instruction *, 4> &HoistInsts,
+                          SmallVector<Instruction *, 4> &SinkInsts) const {
+    // All preheader instructions except the branch must be hoisted or sunk
+    assert(HoistInsts.size() + SinkInsts.size() == FC1.Preheader->size() - 1 &&
+           "Attempting to sink and hoist preheader instructions, but not all "
+           "the preheader instructions are accounted for.");
+
+    NumHoistedInsts += HoistInsts.size();
+    NumSunkInsts += SinkInsts.size();
+
+    LLVM_DEBUG(if (VerboseFusionDebugging) {
+      if (!HoistInsts.empty())
+        dbgs() << "Hoisting: \n";
+      for (Instruction *I : HoistInsts)
+        dbgs() << *I << "\n";
+      if (!SinkInsts.empty())
+        dbgs() << "Sinking: \n";
+      for (Instruction *I : SinkInsts)
+        dbgs() << *I << "\n";
+    });
+
+    for (Instruction *I : HoistInsts) {
+      assert(I->getParent() == FC1.Preheader);
+      I->moveBefore(FC0.Preheader->getTerminator());
+    }
+    // insert instructions in reverse order to maintain dominance relationship
+    for (Instruction *I : reverse(SinkInsts)) {
+      assert(I->getParent() == FC1.Preheader);
+      I->moveBefore(&*FC1.ExitBlock->getFirstInsertionPt());
+    }
+  }
+
   /// Determine if two fusion candidates have identical guards
   ///
   /// This method will determine if two fusion candidates have the same guards.
@@ -1480,6 +1730,7 @@ private:
     // mergeLatch may remove the only block in FC1.
     SE.forgetLoop(FC1.L);
     SE.forgetLoop(FC0.L);
+    SE.forgetLoopDispositions();
 
     // Move instructions from FC0.Latch to FC1.Latch.
     // Note: mergeLatch requires an updated DT.
@@ -1772,6 +2023,7 @@ private:
     // mergeLatch may remove the only block in FC1.
     SE.forgetLoop(FC1.L);
     SE.forgetLoop(FC0.L);
+    SE.forgetLoopDispositions();
 
     // Move instructions from FC0.Latch to FC1.Latch.
     // Note: mergeLatch requires an updated DT.
@@ -1838,6 +2090,7 @@ struct LoopFuseLegacy : public FunctionPass {
   bool runOnFunction(Function &F) override {
     if (skipFunction(F))
       return false;
+
     auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
     auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
     auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI();
@@ -1866,8 +2119,19 @@ PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) {
   const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
   const DataLayout &DL = F.getParent()->getDataLayout();
 
+  // Ensure loops are in simplifed form which is a pre-requisite for loop fusion
+  // pass. Added only for new PM since the legacy PM has already added
+  // LoopSimplify pass as a dependency.
+  bool Changed = false;
+  for (auto &L : LI) {
+    Changed |=
+        simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);
+  }
+  if (Changed)
+    PDT.recalculate(F);
+
   LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);
-  bool Changed = LF.fuseLoops(F);
+  Changed |= LF.fuseLoops(F);
   if (!Changed)
     return PreservedAnalyses::all();