1 files changed, 91 insertions, 70 deletions
diff --git a/lib/Transforms/Utils/LoopUnrollPeel.cpp b/lib/Transforms/Utils/LoopUnrollPeel.cpp
index 005306cf1898..58e42074f963 100644
--- a/lib/Transforms/Utils/LoopUnrollPeel.cpp
+++ b/lib/Transforms/Utils/LoopUnrollPeel.cpp
@@ -62,9 +62,11 @@ static cl::opt<unsigned> UnrollForcePeelCount(
     cl::desc("Force a peel count regardless of profiling information."));
 
 static cl::opt<bool> UnrollPeelMultiDeoptExit(
-    "unroll-peel-multi-deopt-exit", cl::init(false), cl::Hidden,
+    "unroll-peel-multi-deopt-exit", cl::init(true), cl::Hidden,
     cl::desc("Allow peeling of loops with multiple deopt exits."));
 
+static const char *PeeledCountMetaData = "llvm.loop.peeled.count";
+
 // Designates that a Phi is estimated to become invariant after an "infinite"
 // number of loop iterations (i.e. only may become an invariant if the loop is
 // fully unrolled).
@@ -275,6 +277,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
     LLVM_DEBUG(dbgs() << "Force-peeling first " << UnrollForcePeelCount
                       << " iterations.\n");
     UP.PeelCount = UnrollForcePeelCount;
+    UP.PeelProfiledIterations = true;
     return;
   }
 
@@ -282,6 +285,13 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
   if (!UP.AllowPeeling)
     return;
 
+  unsigned AlreadyPeeled = 0;
+  if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData))
+    AlreadyPeeled = *Peeled;
+  // Stop if we already peeled off the maximum number of iterations.
+  if (AlreadyPeeled >= UnrollPeelMaxCount)
+    return;
+
   // Here we try to get rid of Phis which become invariants after 1, 2, ..., N
   // iterations of the loop. For this we compute the number for iterations after
   // which every Phi is guaranteed to become an invariant, and try to peel the
@@ -317,11 +327,14 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
       DesiredPeelCount = std::min(DesiredPeelCount, MaxPeelCount);
       // Consider max peel count limitation.
       assert(DesiredPeelCount > 0 && "Wrong loop size estimation?");
-      LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount
-                        << " iteration(s) to turn"
-                        << " some Phis into invariants.\n");
-      UP.PeelCount = DesiredPeelCount;
-      return;
+      if (DesiredPeelCount + AlreadyPeeled <= UnrollPeelMaxCount) {
+        LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount
+                          << " iteration(s) to turn"
+                          << " some Phis into invariants.\n");
+        UP.PeelCount = DesiredPeelCount;
+        UP.PeelProfiledIterations = false;
+        return;
+      }
     }
   }
 
@@ -330,6 +343,9 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
   if (TripCount)
     return;
 
+  // Do not apply profile base peeling if it is disabled.
+  if (!UP.PeelProfiledIterations)
+    return;
   // If we don't know the trip count, but have reason to believe the average
   // trip count is low, peeling should be beneficial, since we will usually
   // hit the peeled section.
@@ -344,7 +360,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
                       << "\n");
 
     if (*PeelCount) {
-      if ((*PeelCount <= UnrollPeelMaxCount) &&
+      if ((*PeelCount + AlreadyPeeled <= UnrollPeelMaxCount) &&
           (LoopSize * (*PeelCount + 1) <= UP.Threshold)) {
         LLVM_DEBUG(dbgs() << "Peeling first " << *PeelCount
                           << " iterations.\n");
@@ -352,6 +368,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
         return;
       }
       LLVM_DEBUG(dbgs() << "Requested peel count: " << *PeelCount << "\n");
+      LLVM_DEBUG(dbgs() << "Already peel count: " << AlreadyPeeled << "\n");
       LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
       LLVM_DEBUG(dbgs() << "Peel cost: " << LoopSize * (*PeelCount + 1)
                         << "\n");
@@ -364,88 +381,77 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
 /// iteration.
 /// This sets the branch weights for the latch of the recently peeled off loop
 /// iteration correctly.
-/// Our goal is to make sure that:
-/// a) The total weight of all the copies of the loop body is preserved.
-/// b) The total weight of the loop exit is preserved.
-/// c) The body weight is reasonably distributed between the peeled iterations.
+/// Let F is a weight of the edge from latch to header.
+/// Let E is a weight of the edge from latch to exit.
+/// F/(F+E) is a probability to go to loop and E/(F+E) is a probability to
+/// go to exit.
+/// Then, Estimated TripCount = F / E.
+/// For I-th (counting from 0) peeled off iteration we set the the weights for
+/// the peeled latch as (TC - I, 1). It gives us reasonable distribution,
+/// The probability to go to exit 1/(TC-I) increases. At the same time
+/// the estimated trip count of remaining loop reduces by I.
+/// To avoid dealing with division rounding we can just multiple both part
+/// of weights to E and use weight as (F - I * E, E).
 ///
 /// \param Header The copy of the header block that belongs to next iteration.
 /// \param LatchBR The copy of the latch branch that belongs to this iteration.
-/// \param IterNumber The serial number of the iteration that was just
-/// peeled off.
-/// \param AvgIters The average number of iterations we expect the loop to have.
-/// \param[in,out] PeeledHeaderWeight The total number of dynamic loop
-/// iterations that are unaccounted for. As an input, it represents the number
-/// of times we expect to enter the header of the iteration currently being
-/// peeled off. The output is the number of times we expect to enter the
-/// header of the next iteration.
+/// \param[in,out] FallThroughWeight The weight of the edge from latch to
+/// header before peeling (in) and after peeled off one iteration (out).
 static void updateBranchWeights(BasicBlock *Header, BranchInst *LatchBR,
-                                unsigned IterNumber, unsigned AvgIters,
-                                uint64_t &PeeledHeaderWeight) {
-  if (!PeeledHeaderWeight)
+                                uint64_t ExitWeight,
+                                uint64_t &FallThroughWeight) {
+  // FallThroughWeight is 0 means that there is no branch weights on original
+  // latch block or estimated trip count is zero.
+  if (!FallThroughWeight)
     return;
-  // FIXME: Pick a more realistic distribution.
-  // Currently the proportion of weight we assign to the fall-through
-  // side of the branch drops linearly with the iteration number, and we use
-  // a 0.9 fudge factor to make the drop-off less sharp...
-  uint64_t FallThruWeight =
-      PeeledHeaderWeight * ((float)(AvgIters - IterNumber) / AvgIters * 0.9);
-  uint64_t ExitWeight = PeeledHeaderWeight - FallThruWeight;
-  PeeledHeaderWeight -= ExitWeight;
 
   unsigned HeaderIdx = (LatchBR->getSuccessor(0) == Header ? 0 : 1);
   MDBuilder MDB(LatchBR->getContext());
   MDNode *WeightNode =
-      HeaderIdx ? MDB.createBranchWeights(ExitWeight, FallThruWeight)
-                : MDB.createBranchWeights(FallThruWeight, ExitWeight);
+      HeaderIdx ? MDB.createBranchWeights(ExitWeight, FallThroughWeight)
+                : MDB.createBranchWeights(FallThroughWeight, ExitWeight);
   LatchBR->setMetadata(LLVMContext::MD_prof, WeightNode);
+  FallThroughWeight =
+      FallThroughWeight > ExitWeight ? FallThroughWeight - ExitWeight : 1;
 }
 
 /// Initialize the weights.
 ///
 /// \param Header The header block.
 /// \param LatchBR The latch branch.
-/// \param AvgIters The average number of iterations we expect the loop to have.
-/// \param[out] ExitWeight The # of times the edge from Latch to Exit is taken.
-/// \param[out] CurHeaderWeight The # of times the header is executed.
+/// \param[out] ExitWeight The weight of the edge from Latch to Exit.
+/// \param[out] FallThroughWeight The weight of the edge from Latch to Header.
 static void initBranchWeights(BasicBlock *Header, BranchInst *LatchBR,
-                              unsigned AvgIters, uint64_t &ExitWeight,
-                              uint64_t &CurHeaderWeight) {
+                              uint64_t &ExitWeight,
+                              uint64_t &FallThroughWeight) {
   uint64_t TrueWeight, FalseWeight;
   if (!LatchBR->extractProfMetadata(TrueWeight, FalseWeight))
     return;
   unsigned HeaderIdx = LatchBR->getSuccessor(0) == Header ? 0 : 1;
   ExitWeight = HeaderIdx ? TrueWeight : FalseWeight;
-  // The # of times the loop body executes is the sum of the exit block
-  // is taken and the # of times the backedges are taken.
-  CurHeaderWeight = TrueWeight + FalseWeight;
+  FallThroughWeight = HeaderIdx ? FalseWeight : TrueWeight;
 }
 
 /// Update the weights of original Latch block after peeling off all iterations.
 ///
 /// \param Header The header block.
 /// \param LatchBR The latch branch.
-/// \param ExitWeight The weight of the edge from Latch to Exit block.
-/// \param CurHeaderWeight The # of time the header is executed.
+/// \param ExitWeight The weight of the edge from Latch to Exit.
+/// \param FallThroughWeight The weight of the edge from Latch to Header.
 static void fixupBranchWeights(BasicBlock *Header, BranchInst *LatchBR,
-                               uint64_t ExitWeight, uint64_t CurHeaderWeight) {
-  // Adjust the branch weights on the loop exit.
-  if (!ExitWeight)
+                               uint64_t ExitWeight,
+                               uint64_t FallThroughWeight) {
+  // FallThroughWeight is 0 means that there is no branch weights on original
+  // latch block or estimated trip count is zero.
+  if (!FallThroughWeight)
     return;
 
-  // The backedge count is the difference of current header weight and
-  // current loop exit weight. If the current header weight is smaller than
-  // the current loop exit weight, we mark the loop backedge weight as 1.
-  uint64_t BackEdgeWeight = 0;
-  if (ExitWeight < CurHeaderWeight)
-    BackEdgeWeight = CurHeaderWeight - ExitWeight;
-  else
-    BackEdgeWeight = 1;
+  // Sets the branch weights on the loop exit.
   MDBuilder MDB(LatchBR->getContext());
   unsigned HeaderIdx = LatchBR->getSuccessor(0) == Header ? 0 : 1;
   MDNode *WeightNode =
-      HeaderIdx ? MDB.createBranchWeights(ExitWeight, BackEdgeWeight)
-                : MDB.createBranchWeights(BackEdgeWeight, ExitWeight);
+      HeaderIdx ? MDB.createBranchWeights(ExitWeight, FallThroughWeight)
+                : MDB.createBranchWeights(FallThroughWeight, ExitWeight);
   LatchBR->setMetadata(LLVMContext::MD_prof, WeightNode);
 }
 
@@ -586,11 +592,30 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
 
   DenseMap<BasicBlock *, BasicBlock *> ExitIDom;
   if (DT) {
+    // We'd like to determine the idom of exit block after peeling one
+    // iteration.
+    // Let Exit is exit block.
+    // Let ExitingSet - is a set of predecessors of Exit block. They are exiting
+    // blocks.
+    // Let Latch' and ExitingSet' are copies after a peeling.
+    // We'd like to find an idom'(Exit) - idom of Exit after peeling.
+    // It is an evident that idom'(Exit) will be the nearest common dominator
+    // of ExitingSet and ExitingSet'.
+    // idom(Exit) is a nearest common dominator of ExitingSet.
+    // idom(Exit)' is a nearest common dominator of ExitingSet'.
+    // Taking into account that we have a single Latch, Latch' will dominate
+    // Header and idom(Exit).
+    // So the idom'(Exit) is nearest common dominator of idom(Exit)' and Latch'.
+    // All these basic blocks are in the same loop, so what we find is
+    // (nearest common dominator of idom(Exit) and Latch)'.
+    // In the loop below we remember nearest common dominator of idom(Exit) and
+    // Latch to update idom of Exit later.
     assert(L->hasDedicatedExits() && "No dedicated exits?");
     for (auto Edge : ExitEdges) {
       if (ExitIDom.count(Edge.second))
         continue;
-      BasicBlock *BB = DT->getNode(Edge.second)->getIDom()->getBlock();
+      BasicBlock *BB = DT->findNearestCommonDominator(
+          DT->getNode(Edge.second)->getIDom()->getBlock(), Latch);
       assert(L->contains(BB) && "IDom is not in a loop");
       ExitIDom[Edge.second] = BB;
     }
@@ -659,23 +684,14 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
   // newly created branches.
   BranchInst *LatchBR =
       cast<BranchInst>(cast<BasicBlock>(Latch)->getTerminator());
-  uint64_t ExitWeight = 0, CurHeaderWeight = 0;
-  initBranchWeights(Header, LatchBR, PeelCount, ExitWeight, CurHeaderWeight);
+  uint64_t ExitWeight = 0, FallThroughWeight = 0;
+  initBranchWeights(Header, LatchBR, ExitWeight, FallThroughWeight);
 
   // For each peeled-off iteration, make a copy of the loop.
   for (unsigned Iter = 0; Iter < PeelCount; ++Iter) {
     SmallVector<BasicBlock *, 8> NewBlocks;
     ValueToValueMapTy VMap;
 
-    // Subtract the exit weight from the current header weight -- the exit
-    // weight is exactly the weight of the previous iteration's header.
-    // FIXME: due to the way the distribution is constructed, we need a
-    // guard here to make sure we don't end up with non-positive weights.
-    if (ExitWeight < CurHeaderWeight)
-      CurHeaderWeight -= ExitWeight;
-    else
-      CurHeaderWeight = 1;
-
     cloneLoopBlocks(L, Iter, InsertTop, InsertBot, ExitEdges, NewBlocks,
                     LoopBlocks, VMap, LVMap, DT, LI);
 
@@ -697,8 +713,7 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
     }
 
     auto *LatchBRCopy = cast<BranchInst>(VMap[LatchBR]);
-    updateBranchWeights(InsertBot, LatchBRCopy, Iter,
-                        PeelCount, ExitWeight);
+    updateBranchWeights(InsertBot, LatchBRCopy, ExitWeight, FallThroughWeight);
     // Remove Loop metadata from the latch branch instruction
     // because it is not the Loop's latch branch anymore.
     LatchBRCopy->setMetadata(LLVMContext::MD_loop, nullptr);
@@ -724,7 +739,13 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
     PHI->setIncomingValueForBlock(NewPreHeader, NewVal);
   }
 
-  fixupBranchWeights(Header, LatchBR, ExitWeight, CurHeaderWeight);
+  fixupBranchWeights(Header, LatchBR, ExitWeight, FallThroughWeight);
+
+  // Update Metadata for count of peeled off iterations.
+  unsigned AlreadyPeeled = 0;
+  if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData))
+    AlreadyPeeled = *Peeled;
+  addStringMetadataToLoop(L, PeeledCountMetaData, AlreadyPeeled + PeelCount);
 
   if (Loop *ParentLoop = L->getParentLoop())
     L = ParentLoop;