1 files changed, 845 insertions, 202 deletions
diff --git a/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 40e3840e6b0b..4cfc128a8c1d 100644
--- a/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -32,14 +32,15 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
-#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/TailDuplicator.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CommandLine.h"
@@ -49,6 +50,8 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
+#include <functional>
+#include <utility>
 using namespace llvm;
 
 #define DEBUG_TYPE "block-placement"
@@ -82,19 +85,6 @@ static cl::opt<unsigned> ExitBlockBias(
 // Definition:
 // - Outlining: placement of a basic block outside the chain or hot path.
 
-static cl::opt<bool> OutlineOptionalBranches(
-    "outline-optional-branches",
-    cl::desc("Outlining optional branches will place blocks that are optional "
-              "branches, i.e. branches with a common post dominator, outside "
-              "the hot path or chain"),
-    cl::init(false), cl::Hidden);
-
-static cl::opt<unsigned> OutlineOptionalThreshold(
-    "outline-optional-threshold",
-    cl::desc("Don't outline optional branches that are a single block with an "
-             "instruction count below this threshold"),
-    cl::init(4), cl::Hidden);
-
 static cl::opt<unsigned> LoopToColdBlockRatio(
     "loop-to-cold-block-ratio",
     cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
@@ -136,20 +126,47 @@ BranchFoldPlacement("branch-fold-placement",
               cl::init(true), cl::Hidden);
 
 // Heuristic for tail duplication.
-static cl::opt<unsigned> TailDuplicatePlacementThreshold(
+static cl::opt<unsigned> TailDupPlacementThreshold(
     "tail-dup-placement-threshold",
     cl::desc("Instruction cutoff for tail duplication during layout. "
              "Tail merging during layout is forced to have a threshold "
              "that won't conflict."), cl::init(2),
     cl::Hidden);
 
+// Heuristic for tail duplication.
+static cl::opt<unsigned> TailDupPlacementPenalty(
+    "tail-dup-placement-penalty",
+    cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
+             "Copying can increase fallthrough, but it also increases icache "
+             "pressure. This parameter controls the penalty to account for that. "
+             "Percent as integer."),
+    cl::init(2),
+    cl::Hidden);
+
+// Heuristic for triangle chains.
+static cl::opt<unsigned> TriangleChainCount(
+    "triangle-chain-count",
+    cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
+             "triangle tail duplication heuristic to kick in. 0 to disable."),
+    cl::init(2),
+    cl::Hidden);
+
 extern cl::opt<unsigned> StaticLikelyProb;
 extern cl::opt<unsigned> ProfileLikelyProb;
 
+// Internal option used to control BFI display only after MBP pass.
+// Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
+// -view-block-layout-with-bfi=
+extern cl::opt<GVDAGType> ViewBlockLayoutWithBFI;
+
+// Command line option to specify the name of the function for CFG dump
+// Defined in Analysis/BlockFrequencyInfo.cpp:  -view-bfi-func-name=
+extern cl::opt<std::string> ViewBlockFreqFuncName;
+
 namespace {
 class BlockChain;
 /// \brief Type for our function-wide basic block -> block chain mapping.
-typedef DenseMap<MachineBasicBlock *, BlockChain *> BlockToChainMapType;
+typedef DenseMap<const MachineBasicBlock *, BlockChain *> BlockToChainMapType;
 }
 
 namespace {
@@ -193,12 +210,15 @@ public:
 
   /// \brief Iterator over blocks within the chain.
   typedef SmallVectorImpl<MachineBasicBlock *>::iterator iterator;
+  typedef SmallVectorImpl<MachineBasicBlock *>::const_iterator const_iterator;
 
   /// \brief Beginning of blocks within the chain.
   iterator begin() { return Blocks.begin(); }
+  const_iterator begin() const { return Blocks.begin(); }
 
   /// \brief End of blocks within the chain.
   iterator end() { return Blocks.end(); }
+  const_iterator end() const { return Blocks.end(); }
 
   bool remove(MachineBasicBlock* BB) {
     for(iterator i = begin(); i != end(); ++i) {
@@ -264,12 +284,28 @@ public:
 namespace {
 class MachineBlockPlacement : public MachineFunctionPass {
   /// \brief A typedef for a block filter set.
-  typedef SmallSetVector<MachineBasicBlock *, 16> BlockFilterSet;
+  typedef SmallSetVector<const MachineBasicBlock *, 16> BlockFilterSet;
+
+  /// Pair struct containing basic block and taildup profitiability
+  struct BlockAndTailDupResult {
+    MachineBasicBlock *BB;
+    bool ShouldTailDup;
+  };
+
+  /// Triple struct containing edge weight and the edge.
+  struct WeightedEdge {
+    BlockFrequency Weight;
+    MachineBasicBlock *Src;
+    MachineBasicBlock *Dest;
+  };
 
   /// \brief work lists of blocks that are ready to be laid out
   SmallVector<MachineBasicBlock *, 16> BlockWorkList;
   SmallVector<MachineBasicBlock *, 16> EHPadWorkList;
 
+  /// Edges that have already been computed as optimal.
+  DenseMap<const MachineBasicBlock *, BlockAndTailDupResult> ComputedEdges;
+
   /// \brief Machine Function
   MachineFunction *F;
 
@@ -294,7 +330,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
   const TargetLoweringBase *TLI;
 
   /// \brief A handle to the post dominator tree.
-  MachineDominatorTree *MDT;
+  MachinePostDominatorTree *MPDT;
 
   /// \brief Duplicator used to duplicate tails during placement.
   ///
@@ -303,10 +339,6 @@ class MachineBlockPlacement : public MachineFunctionPass {
   /// must be done inline.
   TailDuplicator TailDup;
 
-  /// \brief A set of blocks that are unavoidably execute, i.e. they dominate
-  /// all terminators of the MachineFunction.
-  SmallPtrSet<MachineBasicBlock *, 4> UnavoidableBlocks;
-
   /// \brief Allocator and owner of BlockChain structures.
   ///
   /// We build BlockChains lazily while processing the loop structure of
@@ -322,7 +354,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
   /// BlockChain it participates in, if any. We use it to, among other things,
   /// allow implicitly defining edges between chains as the existing edges
   /// between basic blocks.
-  DenseMap<MachineBasicBlock *, BlockChain *> BlockToChain;
+  DenseMap<const MachineBasicBlock *, BlockChain *> BlockToChain;
 
 #ifndef NDEBUG
   /// The set of basic blocks that have terminators that cannot be fully
@@ -334,75 +366,107 @@ class MachineBlockPlacement : public MachineFunctionPass {
 
   /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
   /// if the count goes to 0, add them to the appropriate work list.
-  void markChainSuccessors(BlockChain &Chain, MachineBasicBlock *LoopHeaderBB,
-                           const BlockFilterSet *BlockFilter = nullptr);
+  void markChainSuccessors(
+      const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
+      const BlockFilterSet *BlockFilter = nullptr);
 
   /// Decrease the UnscheduledPredecessors count for a single block, and
   /// if the count goes to 0, add them to the appropriate work list.
   void markBlockSuccessors(
-      BlockChain &Chain, MachineBasicBlock *BB, MachineBasicBlock *LoopHeaderBB,
+      const BlockChain &Chain, const MachineBasicBlock *BB,
+      const MachineBasicBlock *LoopHeaderBB,
       const BlockFilterSet *BlockFilter = nullptr);
 
-
   BranchProbability
-  collectViableSuccessors(MachineBasicBlock *BB, BlockChain &Chain,
-                          const BlockFilterSet *BlockFilter,
-                          SmallVector<MachineBasicBlock *, 4> &Successors);
-  bool shouldPredBlockBeOutlined(MachineBasicBlock *BB, MachineBasicBlock *Succ,
-                                 BlockChain &Chain,
-                                 const BlockFilterSet *BlockFilter,
-                                 BranchProbability SuccProb,
-                                 BranchProbability HotProb);
+  collectViableSuccessors(
+      const MachineBasicBlock *BB, const BlockChain &Chain,
+      const BlockFilterSet *BlockFilter,
+      SmallVector<MachineBasicBlock *, 4> &Successors);
+  bool shouldPredBlockBeOutlined(
+      const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+      const BlockChain &Chain, const BlockFilterSet *BlockFilter,
+      BranchProbability SuccProb, BranchProbability HotProb);
   bool repeatedlyTailDuplicateBlock(
       MachineBasicBlock *BB, MachineBasicBlock *&LPred,
-      MachineBasicBlock *LoopHeaderBB,
+      const MachineBasicBlock *LoopHeaderBB,
       BlockChain &Chain, BlockFilterSet *BlockFilter,
       MachineFunction::iterator &PrevUnplacedBlockIt);
-  bool maybeTailDuplicateBlock(MachineBasicBlock *BB, MachineBasicBlock *LPred,
-                               const BlockChain &Chain,
-                               BlockFilterSet *BlockFilter,
-                               MachineFunction::iterator &PrevUnplacedBlockIt,
-                               bool &DuplicatedToPred);
-  bool
-  hasBetterLayoutPredecessor(MachineBasicBlock *BB, MachineBasicBlock *Succ,
-                             BlockChain &SuccChain, BranchProbability SuccProb,
-                             BranchProbability RealSuccProb, BlockChain &Chain,
-                             const BlockFilterSet *BlockFilter);
-  MachineBasicBlock *selectBestSuccessor(MachineBasicBlock *BB,
-                                         BlockChain &Chain,
-                                         const BlockFilterSet *BlockFilter);
-  MachineBasicBlock *
-  selectBestCandidateBlock(BlockChain &Chain,
-                           SmallVectorImpl<MachineBasicBlock *> &WorkList);
-  MachineBasicBlock *
-  getFirstUnplacedBlock(const BlockChain &PlacedChain,
-                        MachineFunction::iterator &PrevUnplacedBlockIt,
-                        const BlockFilterSet *BlockFilter);
+  bool maybeTailDuplicateBlock(
+      MachineBasicBlock *BB, MachineBasicBlock *LPred,
+      BlockChain &Chain, BlockFilterSet *BlockFilter,
+      MachineFunction::iterator &PrevUnplacedBlockIt,
+      bool &DuplicatedToPred);
+  bool hasBetterLayoutPredecessor(
+      const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+      const BlockChain &SuccChain, BranchProbability SuccProb,
+      BranchProbability RealSuccProb, const BlockChain &Chain,
+      const BlockFilterSet *BlockFilter);
+  BlockAndTailDupResult selectBestSuccessor(
+      const MachineBasicBlock *BB, const BlockChain &Chain,
+      const BlockFilterSet *BlockFilter);
+  MachineBasicBlock *selectBestCandidateBlock(
+      const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList);
+  MachineBasicBlock *getFirstUnplacedBlock(
+      const BlockChain &PlacedChain,
+      MachineFunction::iterator &PrevUnplacedBlockIt,
+      const BlockFilterSet *BlockFilter);
 
   /// \brief Add a basic block to the work list if it is appropriate.
   ///
   /// If the optional parameter BlockFilter is provided, only MBB
   /// present in the set will be added to the worklist. If nullptr
   /// is provided, no filtering occurs.
-  void fillWorkLists(MachineBasicBlock *MBB,
+  void fillWorkLists(const MachineBasicBlock *MBB,
                      SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
                      const BlockFilterSet *BlockFilter);
-  void buildChain(MachineBasicBlock *BB, BlockChain &Chain,
+  void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
                   BlockFilterSet *BlockFilter = nullptr);
-  MachineBasicBlock *findBestLoopTop(MachineLoop &L,
-                                     const BlockFilterSet &LoopBlockSet);
-  MachineBasicBlock *findBestLoopExit(MachineLoop &L,
-                                      const BlockFilterSet &LoopBlockSet);
-  BlockFilterSet collectLoopBlockSet(MachineLoop &L);
-  void buildLoopChains(MachineLoop &L);
-  void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB,
-                  const BlockFilterSet &LoopBlockSet);
-  void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L,
-                             const BlockFilterSet &LoopBlockSet);
-  void collectMustExecuteBBs();
+  MachineBasicBlock *findBestLoopTop(
+      const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
+  MachineBasicBlock *findBestLoopExit(
+      const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
+  BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
+  void buildLoopChains(const MachineLoop &L);
+  void rotateLoop(
+      BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
+      const BlockFilterSet &LoopBlockSet);
+  void rotateLoopWithProfile(
+      BlockChain &LoopChain, const MachineLoop &L,
+      const BlockFilterSet &LoopBlockSet);
   void buildCFGChains();
   void optimizeBranches();
   void alignBlocks();
+  /// Returns true if a block should be tail-duplicated to increase fallthrough
+  /// opportunities.
+  bool shouldTailDuplicate(MachineBasicBlock *BB);
+  /// Check the edge frequencies to see if tail duplication will increase
+  /// fallthroughs.
+  bool isProfitableToTailDup(
+    const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+    BranchProbability AdjustedSumProb,
+    const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+  /// Check for a trellis layout.
+  bool isTrellis(const MachineBasicBlock *BB,
+                 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+                 const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+  /// Get the best successor given a trellis layout.
+  BlockAndTailDupResult getBestTrellisSuccessor(
+      const MachineBasicBlock *BB,
+      const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+      BranchProbability AdjustedSumProb, const BlockChain &Chain,
+      const BlockFilterSet *BlockFilter);
+  /// Get the best pair of non-conflicting edges.
+  static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
+      const MachineBasicBlock *BB,
+      MutableArrayRef<SmallVector<WeightedEdge, 8>> Edges);
+  /// Returns true if a block can tail duplicate into all unplaced
+  /// predecessors. Filters based on loop.
+  bool canTailDuplicateUnplacedPreds(
+      const MachineBasicBlock *BB, MachineBasicBlock *Succ,
+      const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+  /// Find chains of triangles to tail-duplicate where a global analysis works,
+  /// but a local analysis would not find them.
+  void precomputeTriangleChains();
 
 public:
   static char ID; // Pass identification, replacement for typeid
@@ -415,7 +479,8 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<MachineBranchProbabilityInfo>();
     AU.addRequired<MachineBlockFrequencyInfo>();
-    AU.addRequired<MachineDominatorTree>();
+    if (TailDupPlacement)
+      AU.addRequired<MachinePostDominatorTree>();
     AU.addRequired<MachineLoopInfo>();
     AU.addRequired<TargetPassConfig>();
     MachineFunctionPass::getAnalysisUsage(AU);
@@ -429,7 +494,7 @@ INITIALIZE_PASS_BEGIN(MachineBlockPlacement, "block-placement",
                       "Branch Probability Basic Block Placement", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
 INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement",
                     "Branch Probability Basic Block Placement", false, false)
@@ -438,7 +503,7 @@ INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement",
 /// \brief Helper to print the name of a MBB.
 ///
 /// Only used by debug logging.
-static std::string getBlockName(MachineBasicBlock *BB) {
+static std::string getBlockName(const MachineBasicBlock *BB) {
   std::string Result;
   raw_string_ostream OS(Result);
   OS << "BB#" << BB->getNumber();
@@ -455,7 +520,7 @@ static std::string getBlockName(MachineBasicBlock *BB) {
 /// having one fewer active predecessor. It also adds any successors of this
 /// chain which reach the zero-predecessor state to the appropriate worklist.
 void MachineBlockPlacement::markChainSuccessors(
-    BlockChain &Chain, MachineBasicBlock *LoopHeaderBB,
+    const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
     const BlockFilterSet *BlockFilter) {
   // Walk all the blocks in this chain, marking their successors as having
   // a predecessor placed.
@@ -471,8 +536,8 @@ void MachineBlockPlacement::markChainSuccessors(
 /// and was duplicated into the chain end, we need to redo markBlockSuccessors
 /// for just that block.
 void MachineBlockPlacement::markBlockSuccessors(
-    BlockChain &Chain, MachineBasicBlock *MBB, MachineBasicBlock *LoopHeaderBB,
-    const BlockFilterSet *BlockFilter) {
+    const BlockChain &Chain, const MachineBasicBlock *MBB,
+    const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
   // Add any successors for which this is the only un-placed in-loop
   // predecessor to the worklist as a viable candidate for CFG-neutral
   // placement. No subsequent placement of this block will violate the CFG
@@ -504,7 +569,8 @@ void MachineBlockPlacement::markBlockSuccessors(
 /// the total branch probability of edges from \p BB to those
 /// blocks.
 BranchProbability MachineBlockPlacement::collectViableSuccessors(
-    MachineBasicBlock *BB, BlockChain &Chain, const BlockFilterSet *BlockFilter,
+    const MachineBasicBlock *BB, const BlockChain &Chain,
+    const BlockFilterSet *BlockFilter,
     SmallVector<MachineBasicBlock *, 4> &Successors) {
   // Adjust edge probabilities by excluding edges pointing to blocks that is
   // either not in BlockFilter or is already in the current chain. Consider the
@@ -561,46 +627,573 @@ getAdjustedProbability(BranchProbability OrigProb,
   return SuccProb;
 }
 
-/// When the option OutlineOptionalBranches is on, this method
-/// checks if the fallthrough candidate block \p Succ (of block
-/// \p BB) also has other unscheduled predecessor blocks which
-/// are also successors of \p BB (forming triangular shape CFG).
-/// If none of such predecessors are small, it returns true.
-/// The caller can choose to select \p Succ as the layout successors
-/// so that \p Succ's predecessors (optional branches) can be
-/// outlined.
-/// FIXME: fold this with more general layout cost analysis.
-bool MachineBlockPlacement::shouldPredBlockBeOutlined(
-    MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain,
-    const BlockFilterSet *BlockFilter, BranchProbability SuccProb,
-    BranchProbability HotProb) {
-  if (!OutlineOptionalBranches)
+/// Check if \p BB has exactly the successors in \p Successors.
+static bool
+hasSameSuccessors(MachineBasicBlock &BB,
+                  SmallPtrSetImpl<const MachineBasicBlock *> &Successors) {
+  if (BB.succ_size() != Successors.size())
     return false;
-  // If we outline optional branches, look whether Succ is unavoidable, i.e.
-  // dominates all terminators of the MachineFunction. If it does, other
-  // successors must be optional. Don't do this for cold branches.
-  if (SuccProb > HotProb.getCompl() && UnavoidableBlocks.count(Succ) > 0) {
-    for (MachineBasicBlock *Pred : Succ->predecessors()) {
-      // Check whether there is an unplaced optional branch.
-      if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) ||
-          BlockToChain[Pred] == &Chain)
+  // We don't want to count self-loops
+  if (Successors.count(&BB))
+    return false;
+  for (MachineBasicBlock *Succ : BB.successors())
+    if (!Successors.count(Succ))
+      return false;
+  return true;
+}
+
+/// Check if a block should be tail duplicated to increase fallthrough
+/// opportunities.
+/// \p BB Block to check.
+bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
+  // Blocks with single successors don't create additional fallthrough
+  // opportunities. Don't duplicate them. TODO: When conditional exits are
+  // analyzable, allow them to be duplicated.
+  bool IsSimple = TailDup.isSimpleBB(BB);
+
+  if (BB->succ_size() == 1)
+    return false;
+  return TailDup.shouldTailDuplicate(IsSimple, *BB);
+}
+
+/// Compare 2 BlockFrequency's with a small penalty for \p A.
+/// In order to be conservative, we apply a X% penalty to account for
+/// increased icache pressure and static heuristics. For small frequencies
+/// we use only the numerators to improve accuracy. For simplicity, we assume the
+/// penalty is less than 100%
+/// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
+static bool greaterWithBias(BlockFrequency A, BlockFrequency B,
+                            uint64_t EntryFreq) {
+  BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
+  BlockFrequency Gain = A - B;
+  return (Gain / ThresholdProb).getFrequency() >= EntryFreq;
+}
+
+/// Check the edge frequencies to see if tail duplication will increase
+/// fallthroughs. It only makes sense to call this function when
+/// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
+/// always locally profitable if we would have picked \p Succ without
+/// considering duplication.
+bool MachineBlockPlacement::isProfitableToTailDup(
+    const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+    BranchProbability QProb,
+    const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+  // We need to do a probability calculation to make sure this is profitable.
+  // First: does succ have a successor that post-dominates? This affects the
+  // calculation. The 2 relevant cases are:
+  //    BB         BB
+  //    | \Qout    | \Qout
+  //   P|  C       |P C
+  //    =   C'     =   C'
+  //    |  /Qin    |  /Qin
+  //    | /        | /
+  //    Succ       Succ
+  //    / \        | \  V
+  //  U/   =V      |U \
+  //  /     \      =   D
+  //  D      E     |  /
+  //               | /
+  //               |/
+  //               PDom
+  //  '=' : Branch taken for that CFG edge
+  // In the second case, Placing Succ while duplicating it into C prevents the
+  // fallthrough of Succ into either D or PDom, because they now have C as an
+  // unplaced predecessor
+
+  // Start by figuring out which case we fall into
+  MachineBasicBlock *PDom = nullptr;
+  SmallVector<MachineBasicBlock *, 4> SuccSuccs;
+  // Only scan the relevant successors
+  auto AdjustedSuccSumProb =
+      collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
+  BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
+  auto BBFreq = MBFI->getBlockFreq(BB);
+  auto SuccFreq = MBFI->getBlockFreq(Succ);
+  BlockFrequency P = BBFreq * PProb;
+  BlockFrequency Qout = BBFreq * QProb;
+  uint64_t EntryFreq = MBFI->getEntryFreq();
+  // If there are no more successors, it is profitable to copy, as it strictly
+  // increases fallthrough.
+  if (SuccSuccs.size() == 0)
+    return greaterWithBias(P, Qout, EntryFreq);
+
+  auto BestSuccSucc = BranchProbability::getZero();
+  // Find the PDom or the best Succ if no PDom exists.
+  for (MachineBasicBlock *SuccSucc : SuccSuccs) {
+    auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
+    if (Prob > BestSuccSucc)
+      BestSuccSucc = Prob;
+    if (PDom == nullptr)
+      if (MPDT->dominates(SuccSucc, Succ)) {
+        PDom = SuccSucc;
+        break;
+      }
+  }
+  // For the comparisons, we need to know Succ's best incoming edge that isn't
+  // from BB.
+  auto SuccBestPred = BlockFrequency(0);
+  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
+    if (SuccPred == Succ || SuccPred == BB
+        || BlockToChain[SuccPred] == &Chain
+        || (BlockFilter && !BlockFilter->count(SuccPred)))
+      continue;
+    auto Freq = MBFI->getBlockFreq(SuccPred)
+        * MBPI->getEdgeProbability(SuccPred, Succ);
+    if (Freq > SuccBestPred)
+      SuccBestPred = Freq;
+  }
+  // Qin is Succ's best unplaced incoming edge that isn't BB
+  BlockFrequency Qin = SuccBestPred;
+  // If it doesn't have a post-dominating successor, here is the calculation:
+  //    BB        BB
+  //    | \Qout   |  \
+  //   P|  C      |   =
+  //    =   C'    |    C
+  //    |  /Qin   |     |
+  //    | /       |     C' (+Succ)
+  //    Succ      Succ /|
+  //    / \       |  \/ |
+  //  U/   =V     |  == |
+  //  /     \     | /  \|
+  //  D      E    D     E
+  //  '=' : Branch taken for that CFG edge
+  //  Cost in the first case is: P + V
+  //  For this calculation, we always assume P > Qout. If Qout > P
+  //  The result of this function will be ignored at the caller.
+  //  Let F = SuccFreq - Qin
+  //  Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
+
+  if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
+    BranchProbability UProb = BestSuccSucc;
+    BranchProbability VProb = AdjustedSuccSumProb - UProb;
+    BlockFrequency F = SuccFreq - Qin;
+    BlockFrequency V = SuccFreq * VProb;
+    BlockFrequency QinU = std::min(Qin, F) * UProb;
+    BlockFrequency BaseCost = P + V;
+    BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
+    return greaterWithBias(BaseCost, DupCost, EntryFreq);
+  }
+  BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
+  BranchProbability VProb = AdjustedSuccSumProb - UProb;
+  BlockFrequency U = SuccFreq * UProb;
+  BlockFrequency V = SuccFreq * VProb;
+  BlockFrequency F = SuccFreq - Qin;
+  // If there is a post-dominating successor, here is the calculation:
+  // BB         BB                 BB          BB
+  // | \Qout    |   \               | \Qout     |  \
+  // |P C       |    =              |P C        |   =
+  // =   C'     |P    C             =   C'      |P   C
+  // |  /Qin    |      |            |  /Qin     |     |
+  // | /        |      C' (+Succ)   | /         |     C' (+Succ)
+  // Succ       Succ  /|            Succ        Succ /|
+  // | \  V     |   \/ |            | \  V      |  \/ |
+  // |U \       |U  /\ =?           |U =        |U /\ |
+  // =   D      = =  =?|            |   D       | =  =|
+  // |  /       |/     D            |  /        |/    D
+  // | /        |     /             | =         |    /
+  // |/         |    /              |/          |   =
+  // Dom         Dom                Dom         Dom
+  //  '=' : Branch taken for that CFG edge
+  // The cost for taken branches in the first case is P + U
+  // Let F = SuccFreq - Qin
+  // The cost in the second case (assuming independence), given the layout:
+  // BB, Succ, (C+Succ), D, Dom or the layout:
+  // BB, Succ, D, Dom, (C+Succ)
+  // is Qout + max(F, Qin) * U + min(F, Qin)
+  // compare P + U vs Qout + P * U + Qin.
+  //
+  // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
+  //
+  // For the 3rd case, the cost is P + 2 * V
+  // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
+  // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
+  if (UProb > AdjustedSuccSumProb / 2 &&
+      !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
+                                  Chain, BlockFilter))
+    // Cases 3 & 4
+    return greaterWithBias(
+        (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
+        EntryFreq);
+  // Cases 1 & 2
+  return greaterWithBias((P + U),
+                         (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
+                          std::max(Qin, F) * UProb),
+                         EntryFreq);
+}
+
+/// Check for a trellis layout. \p BB is the upper part of a trellis if its
+/// successors form the lower part of a trellis. A successor set S forms the
+/// lower part of a trellis if all of the predecessors of S are either in S or
+/// have all of S as successors. We ignore trellises where BB doesn't have 2
+/// successors because for fewer than 2, it's trivial, and for 3 or greater they
+/// are very uncommon and complex to compute optimally. Allowing edges within S
+/// is not strictly a trellis, but the same algorithm works, so we allow it.
+bool MachineBlockPlacement::isTrellis(
+    const MachineBasicBlock *BB,
+    const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+    const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+  // Technically BB could form a trellis with branching factor higher than 2.
+  // But that's extremely uncommon.
+  if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
+    return false;
+
+  SmallPtrSet<const MachineBasicBlock *, 2> Successors(BB->succ_begin(),
+                                                       BB->succ_end());
+  // To avoid reviewing the same predecessors twice.
+  SmallPtrSet<const MachineBasicBlock *, 8> SeenPreds;
+
+  for (MachineBasicBlock *Succ : ViableSuccs) {
+    int PredCount = 0;
+    for (auto SuccPred : Succ->predecessors()) {
+      // Allow triangle successors, but don't count them.
+      if (Successors.count(SuccPred)) {
+        // Make sure that it is actually a triangle.
+        for (MachineBasicBlock *CheckSucc : SuccPred->successors())
+          if (!Successors.count(CheckSucc))
+            return false;
         continue;
-      // Check whether the optional branch has exactly one BB.
-      if (Pred->pred_size() > 1 || *Pred->pred_begin() != BB)
+      }
+      const BlockChain *PredChain = BlockToChain[SuccPred];
+      if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
+          PredChain == &Chain || PredChain == BlockToChain[Succ])
         continue;
-      // Check whether the optional branch is small.
-      if (Pred->size() < OutlineOptionalThreshold)
+      ++PredCount;
+      // Perform the successor check only once.
+      if (!SeenPreds.insert(SuccPred).second)
+        continue;
+      if (!hasSameSuccessors(*SuccPred, Successors))
         return false;
     }
-    return true;
-  } else
+    // If one of the successors has only BB as a predecessor, it is not a
+    // trellis.
+    if (PredCount < 1)
+      return false;
+  }
+  return true;
+}
+
+/// Pick the highest total weight pair of edges that can both be laid out.
+/// The edges in \p Edges[0] are assumed to have a different destination than
+/// the edges in \p Edges[1]. Simple counting shows that the best pair is either
+/// the individual highest weight edges to the 2 different destinations, or in
+/// case of a conflict, one of them should be replaced with a 2nd best edge.
+std::pair<MachineBlockPlacement::WeightedEdge,
+          MachineBlockPlacement::WeightedEdge>
+MachineBlockPlacement::getBestNonConflictingEdges(
+    const MachineBasicBlock *BB,
+    MutableArrayRef<SmallVector<MachineBlockPlacement::WeightedEdge, 8>>
+        Edges) {
+  // Sort the edges, and then for each successor, find the best incoming
+  // predecessor. If the best incoming predecessors aren't the same,
+  // then that is clearly the best layout. If there is a conflict, one of the
+  // successors will have to fallthrough from the second best predecessor. We
+  // compare which combination is better overall.
+
+  // Sort for highest frequency.
+  auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
+
+  std::stable_sort(Edges[0].begin(), Edges[0].end(), Cmp);
+  std::stable_sort(Edges[1].begin(), Edges[1].end(), Cmp);
+  auto BestA = Edges[0].begin();
+  auto BestB = Edges[1].begin();
+  // Arrange for the correct answer to be in BestA and BestB
+  // If the 2 best edges don't conflict, the answer is already there.
+  if (BestA->Src == BestB->Src) {
+    // Compare the total fallthrough of (Best + Second Best) for both pairs
+    auto SecondBestA = std::next(BestA);
+    auto SecondBestB = std::next(BestB);
+    BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
+    BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
+    if (BestAScore < BestBScore)
+      BestA = SecondBestA;
+    else
+      BestB = SecondBestB;
+  }
+  // Arrange for the BB edge to be in BestA if it exists.
+  if (BestB->Src == BB)
+    std::swap(BestA, BestB);
+  return std::make_pair(*BestA, *BestB);
+}
+
+/// Get the best successor from \p BB based on \p BB being part of a trellis.
+/// We only handle trellises with 2 successors, so the algorithm is
+/// straightforward: Find the best pair of edges that don't conflict. We find
+/// the best incoming edge for each successor in the trellis. If those conflict,
+/// we consider which of them should be replaced with the second best.
+/// Upon return the two best edges will be in \p BestEdges. If one of the edges
+/// comes from \p BB, it will be in \p BestEdges[0]
+MachineBlockPlacement::BlockAndTailDupResult
+MachineBlockPlacement::getBestTrellisSuccessor(
+    const MachineBasicBlock *BB,
+    const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+    BranchProbability AdjustedSumProb, const BlockChain &Chain,
+    const BlockFilterSet *BlockFilter) {
+
+  BlockAndTailDupResult Result = {nullptr, false};
+  SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
+                                                       BB->succ_end());
+
+  // We assume size 2 because it's common. For general n, we would have to do
+  // the Hungarian algorithm, but it's not worth the complexity because more
+  // than 2 successors is fairly uncommon, and a trellis even more so.
+  if (Successors.size() != 2 || ViableSuccs.size() != 2)
+    return Result;
+
+  // Collect the edge frequencies of all edges that form the trellis.
+  SmallVector<WeightedEdge, 8> Edges[2];
+  int SuccIndex = 0;
+  for (auto Succ : ViableSuccs) {
+    for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
+      // Skip any placed predecessors that are not BB
+      if (SuccPred != BB)
+        if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
+            BlockToChain[SuccPred] == &Chain ||
+            BlockToChain[SuccPred] == BlockToChain[Succ])
+          continue;
+      BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
+                                MBPI->getEdgeProbability(SuccPred, Succ);
+      Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
+    }
+    ++SuccIndex;
+  }
+
+  // Pick the best combination of 2 edges from all the edges in the trellis.
+  WeightedEdge BestA, BestB;
+  std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
+
+  if (BestA.Src != BB) {
+    // If we have a trellis, and BB doesn't have the best fallthrough edges,
+    // we shouldn't choose any successor. We've already looked and there's a
+    // better fallthrough edge for all the successors.
+    DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
+    return Result;
+  }
+
+  // Did we pick the triangle edge? If tail-duplication is profitable, do
+  // that instead. Otherwise merge the triangle edge now while we know it is
+  // optimal.
+  if (BestA.Dest == BestB.Src) {
+    // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
+    // would be better.
+    MachineBasicBlock *Succ1 = BestA.Dest;
+    MachineBasicBlock *Succ2 = BestB.Dest;
+    // Check to see if tail-duplication would be profitable.
+    if (TailDupPlacement && shouldTailDuplicate(Succ2) &&
+        canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
+        isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
+                              Chain, BlockFilter)) {
+      DEBUG(BranchProbability Succ2Prob = getAdjustedProbability(
+                MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
+            dbgs() << "    Selected: " << getBlockName(Succ2)
+                   << ", probability: " << Succ2Prob << " (Tail Duplicate)\n");
+      Result.BB = Succ2;
+      Result.ShouldTailDup = true;
+      return Result;
+    }
+  }
+  // We have already computed the optimal edge for the other side of the
+  // trellis.
+  ComputedEdges[BestB.Src] = { BestB.Dest, false };
+
+  auto TrellisSucc = BestA.Dest;
+  DEBUG(BranchProbability SuccProb = getAdjustedProbability(
+            MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
+        dbgs() << "    Selected: " << getBlockName(TrellisSucc)
+               << ", probability: " << SuccProb << " (Trellis)\n");
+  Result.BB = TrellisSucc;
+  return Result;
+}
+
+/// When the option TailDupPlacement is on, this method checks if the
+/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
+/// into all of its unplaced, unfiltered predecessors, that are not BB.
+bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
+    const MachineBasicBlock *BB, MachineBasicBlock *Succ,
+    const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+  if (!shouldTailDuplicate(Succ))
     return false;
+
+  // For CFG checking.
+  SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
+                                                       BB->succ_end());
+  for (MachineBasicBlock *Pred : Succ->predecessors()) {
+    // Make sure all unplaced and unfiltered predecessors can be
+    // tail-duplicated into.
+    // Skip any blocks that are already placed or not in this loop.
+    if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
+        || BlockToChain[Pred] == &Chain)
+      continue;
+    if (!TailDup.canTailDuplicate(Succ, Pred)) {
+      if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
+        // This will result in a trellis after tail duplication, so we don't
+        // need to copy Succ into this predecessor. In the presence
+        // of a trellis tail duplication can continue to be profitable.
+        // For example:
+        // A            A
+        // |\           |\
+        // | \          | \
+        // |  C         |  C+BB
+        // | /          |  |
+        // |/           |  |
+        // BB    =>     BB |
+        // |\           |\/|
+        // | \          |/\|
+        // |  D         |  D
+        // | /          | /
+        // |/           |/
+        // Succ         Succ
+        //
+        // After BB was duplicated into C, the layout looks like the one on the
+        // right. BB and C now have the same successors. When considering
+        // whether Succ can be duplicated into all its unplaced predecessors, we
+        // ignore C.
+        // We can do this because C already has a profitable fallthrough, namely
+        // D. TODO(iteratee): ignore sufficiently cold predecessors for
+        // duplication and for this test.
+        //
+        // This allows trellises to be laid out in 2 separate chains
+        // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
+        // because it allows the creation of 2 fallthrough paths with links
+        // between them, and we correctly identify the best layout for these
+        // CFGs. We want to extend trellises that the user created in addition
+        // to trellises created by tail-duplication, so we just look for the
+        // CFG.
+        continue;
+      return false;
+    }
+  }
+  return true;
+}
+
+/// Find chains of triangles where we believe it would be profitable to
+/// tail-duplicate them all, but a local analysis would not find them.
+/// There are 3 ways this can be profitable:
+/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
+///    longer chains)
+/// 2) The chains are statically correlated. Branch probabilities have a very
+///    U-shaped distribution.
+///    [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
+///    If the branches in a chain are likely to be from the same side of the
+///    distribution as their predecessor, but are independent at runtime, this
+///    transformation is profitable. (Because the cost of being wrong is a small
+///    fixed cost, unlike the standard triangle layout where the cost of being
+///    wrong scales with the # of triangles.)
+/// 3) The chains are dynamically correlated. If the probability that a previous
+///    branch was taken positively influences whether the next branch will be
+///    taken
+/// We believe that 2 and 3 are common enough to justify the small margin in 1.
+void MachineBlockPlacement::precomputeTriangleChains() {
+  struct TriangleChain {
+    std::vector<MachineBasicBlock *> Edges;
+    TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
+        : Edges({src, dst}) {}
+
+    void append(MachineBasicBlock *dst) {
+      assert(getKey()->isSuccessor(dst) &&
+             "Attempting to append a block that is not a successor.");
+      Edges.push_back(dst);
+    }
+
+    unsigned count() const { return Edges.size() - 1; }
+
+    MachineBasicBlock *getKey() const {
+      return Edges.back();
+    }
+  };
+
+  if (TriangleChainCount == 0)
+    return;
+
+  DEBUG(dbgs() << "Pre-computing triangle chains.\n");
+  // Map from last block to the chain that contains it. This allows us to extend
+  // chains as we find new triangles.
+  DenseMap<const MachineBasicBlock *, TriangleChain> TriangleChainMap;
+  for (MachineBasicBlock &BB : *F) {
+    // If BB doesn't have 2 successors, it doesn't start a triangle.
+    if (BB.succ_size() != 2)
+      continue;
+    MachineBasicBlock *PDom = nullptr;
+    for (MachineBasicBlock *Succ : BB.successors()) {
+      if (!MPDT->dominates(Succ, &BB))
+        continue;
+      PDom = Succ;
+      break;
+    }
+    // If BB doesn't have a post-dominating successor, it doesn't form a
+    // triangle.
+    if (PDom == nullptr)
+      continue;
+    // If PDom has a hint that it is low probability, skip this triangle.
+    if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
+      continue;
+    // If PDom isn't eligible for duplication, this isn't the kind of triangle
+    // we're looking for.
+    if (!shouldTailDuplicate(PDom))
+      continue;
+    bool CanTailDuplicate = true;
+    // If PDom can't tail-duplicate into it's non-BB predecessors, then this
+    // isn't the kind of triangle we're looking for.
+    for (MachineBasicBlock* Pred : PDom->predecessors()) {
+      if (Pred == &BB)
+        continue;
+      if (!TailDup.canTailDuplicate(PDom, Pred)) {
+        CanTailDuplicate = false;
+        break;
+      }
+    }
+    // If we can't tail-duplicate PDom to its predecessors, then skip this
+    // triangle.
+    if (!CanTailDuplicate)
+      continue;
+
+    // Now we have an interesting triangle. Insert it if it's not part of an
+    // existing chain
+    // Note: This cannot be replaced with a call insert() or emplace() because
+    // the find key is BB, but the insert/emplace key is PDom.
+    auto Found = TriangleChainMap.find(&BB);
+    // If it is, remove the chain from the map, grow it, and put it back in the
+    // map with the end as the new key.
+    if (Found != TriangleChainMap.end()) {
+      TriangleChain Chain = std::move(Found->second);
+      TriangleChainMap.erase(Found);
+      Chain.append(PDom);
+      TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
+    } else {
+      auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
+      assert(InsertResult.second && "Block seen twice.");
+      (void)InsertResult;
+    }
+  }
+
+  // Iterating over a DenseMap is safe here, because the only thing in the body
+  // of the loop is inserting into another DenseMap (ComputedEdges).
+  // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
+  for (auto &ChainPair : TriangleChainMap) {
+    TriangleChain &Chain = ChainPair.second;
+    // Benchmarking has shown that due to branch correlation duplicating 2 or
+    // more triangles is profitable, despite the calculations assuming
+    // independence.
+    if (Chain.count() < TriangleChainCount)
+      continue;
+    MachineBasicBlock *dst = Chain.Edges.back();
+    Chain.Edges.pop_back();
+    for (MachineBasicBlock *src : reverse(Chain.Edges)) {
+      DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->" <<
+            getBlockName(dst) << " as pre-computed based on triangles.\n");
+
+      auto InsertResult = ComputedEdges.insert({src, {dst, true}});
+      assert(InsertResult.second && "Block seen twice.");
+      (void)InsertResult;
+
+      dst = src;
+    }
+  }
 }
 
 // When profile is not present, return the StaticLikelyProb.
 // When profile is available, we need to handle the triangle-shape CFG.
 static BranchProbability getLayoutSuccessorProbThreshold(
-      MachineBasicBlock *BB) {
+      const MachineBasicBlock *BB) {
   if (!BB->getParent()->getFunction()->getEntryCount())
     return BranchProbability(StaticLikelyProb, 100);
   if (BB->succ_size() == 2) {
@@ -609,11 +1202,11 @@ static BranchProbability getLayoutSuccessorProbThreshold(
     if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
       /* See case 1 below for the cost analysis. For BB->Succ to
        * be taken with smaller cost, the following needs to hold:
-       *   Prob(BB->Succ) > 2* Prob(BB->Pred)
-       *   So the threshold T
-       *   T = 2 * (1-Prob(BB->Pred). Since T + Prob(BB->Pred) == 1,
-       * We have  T + T/2 = 1, i.e. T = 2/3. Also adding user specified
-       * branch bias, we have
+       *   Prob(BB->Succ) > 2 * Prob(BB->Pred)
+       *   So the threshold T in the calculation below
+       *   (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
+       *   So T / (1 - T) = 2, Yielding T = 2/3
+       * Also adding user specified branch bias, we have
        *   T = (2/3)*(ProfileLikelyProb/50)
        *     = (2*ProfileLikelyProb)/150)
        */
@@ -625,10 +1218,17 @@ static BranchProbability getLayoutSuccessorProbThreshold(
 
 /// Checks to see if the layout candidate block \p Succ has a better layout
 /// predecessor than \c BB. If yes, returns true.
+/// \p SuccProb: The probability adjusted for only remaining blocks.
+///   Only used for logging
+/// \p RealSuccProb: The un-adjusted probability.
+/// \p Chain: The chain that BB belongs to and Succ is being considered for.
+/// \p BlockFilter: if non-null, the set of blocks that make up the loop being
+///    considered
 bool MachineBlockPlacement::hasBetterLayoutPredecessor(
-    MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &SuccChain,
-    BranchProbability SuccProb, BranchProbability RealSuccProb,
-    BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+    const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+    const BlockChain &SuccChain, BranchProbability SuccProb,
+    BranchProbability RealSuccProb, const BlockChain &Chain,
+    const BlockFilterSet *BlockFilter) {
 
   // There isn't a better layout when there are no unscheduled predecessors.
   if (SuccChain.UnscheduledPredecessors == 0)
@@ -734,11 +1334,12 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
   //  |  Pred----|                     |  S1----
   //  |  |                             |       |
   //  --(S1 or S2)                     ---Pred--
+  //                                        |
+  //                                       S2
   //
   // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
   //    + min(freq(Pred->S1), freq(Pred->S2))
   // Non-topo-order cost:
-  // In the worst case, S2 will not get laid out after Pred.
   // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
   // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
   // is 0. Then the non topo layout is better when
@@ -756,13 +1357,15 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
   for (MachineBasicBlock *Pred : Succ->predecessors()) {
     if (Pred == Succ || BlockToChain[Pred] == &SuccChain ||
         (BlockFilter && !BlockFilter->count(Pred)) ||
-        BlockToChain[Pred] == &Chain)
+        BlockToChain[Pred] == &Chain ||
+        // This check is redundant except for look ahead. This function is
+        // called for lookahead by isProfitableToTailDup when BB hasn't been
+        // placed yet.
+        (Pred == BB))
       continue;
     // Do backward checking.
     // For all cases above, we need a backward checking to filter out edges that
-    // are not 'strongly' biased. With profile data available, the check is
-    // mostly redundant for case 2 (when threshold prob is set at 50%) unless S
-    // has more than two successors.
+    // are not 'strongly' biased.
     // BB  Pred
     //  \ /
     //  Succ
@@ -798,14 +1401,15 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
 /// breaking CFG structure, but cave and break such structures in the case of
 /// very hot successor edges.
 ///
-/// \returns The best successor block found, or null if none are viable.
-MachineBasicBlock *
-MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
-                                           BlockChain &Chain,
-                                           const BlockFilterSet *BlockFilter) {
+/// \returns The best successor block found, or null if none are viable, along
+/// with a boolean indicating if tail duplication is necessary.
+MachineBlockPlacement::BlockAndTailDupResult
+MachineBlockPlacement::selectBestSuccessor(
+    const MachineBasicBlock *BB, const BlockChain &Chain,
+    const BlockFilterSet *BlockFilter) {
   const BranchProbability HotProb(StaticLikelyProb, 100);
 
-  MachineBasicBlock *BestSucc = nullptr;
+  BlockAndTailDupResult BestSucc = { nullptr, false };
   auto BestProb = BranchProbability::getZero();
 
   SmallVector<MachineBasicBlock *, 4> Successors;
@@ -813,22 +1417,45 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
       collectViableSuccessors(BB, Chain, BlockFilter, Successors);
 
   DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB) << "\n");
+
+  // if we already precomputed the best successor for BB, return that if still
+  // applicable.
+  auto FoundEdge = ComputedEdges.find(BB);
+  if (FoundEdge != ComputedEdges.end()) {
+    MachineBasicBlock *Succ = FoundEdge->second.BB;
+    ComputedEdges.erase(FoundEdge);
+    BlockChain *SuccChain = BlockToChain[Succ];
+    if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
+        SuccChain != &Chain && Succ == *SuccChain->begin())
+      return FoundEdge->second;
+  }
+
+  // if BB is part of a trellis, Use the trellis to determine the optimal
+  // fallthrough edges
+  if (isTrellis(BB, Successors, Chain, BlockFilter))
+    return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
+                                   BlockFilter);
+
+  // For blocks with CFG violations, we may be able to lay them out anyway with
+  // tail-duplication. We keep this vector so we can perform the probability
+  // calculations the minimum number of times.
+  SmallVector<std::tuple<BranchProbability, MachineBasicBlock *>, 4>
+      DupCandidates;
   for (MachineBasicBlock *Succ : Successors) {
     auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
     BranchProbability SuccProb =
         getAdjustedProbability(RealSuccProb, AdjustedSumProb);
 
-    // This heuristic is off by default.
-    if (shouldPredBlockBeOutlined(BB, Succ, Chain, BlockFilter, SuccProb,
-                                  HotProb))
-      return Succ;
-
     BlockChain &SuccChain = *BlockToChain[Succ];
     // Skip the edge \c BB->Succ if block \c Succ has a better layout
     // predecessor that yields lower global cost.
     if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
-                                   Chain, BlockFilter))
+                                   Chain, BlockFilter)) {
+      // If tail duplication would make Succ profitable, place it.
+      if (TailDupPlacement && shouldTailDuplicate(Succ))
+        DupCandidates.push_back(std::make_tuple(SuccProb, Succ));
       continue;
+    }
 
     DEBUG(
         dbgs() << "    Candidate: " << getBlockName(Succ) << ", probability: "
@@ -836,17 +1463,48 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
                << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
                << "\n");
 
-    if (BestSucc && BestProb >= SuccProb) {
+    if (BestSucc.BB && BestProb >= SuccProb) {
       DEBUG(dbgs() << "    Not the best candidate, continuing\n");
       continue;
     }
 
     DEBUG(dbgs() << "    Setting it as best candidate\n");
-    BestSucc = Succ;
+    BestSucc.BB = Succ;
     BestProb = SuccProb;
   }
-  if (BestSucc)
-    DEBUG(dbgs() << "    Selected: " << getBlockName(BestSucc) << "\n");
+  // Handle the tail duplication candidates in order of decreasing probability.
+  // Stop at the first one that is profitable. Also stop if they are less
+  // profitable than BestSucc. Position is important because we preserve it and
+  // prefer first best match. Here we aren't comparing in order, so we capture
+  // the position instead.
+  if (DupCandidates.size() != 0) {
+    auto cmp =
+        [](const std::tuple<BranchProbability, MachineBasicBlock *> &a,
+           const std::tuple<BranchProbability, MachineBasicBlock *> &b) {
+          return std::get<0>(a) > std::get<0>(b);
+        };
+    std::stable_sort(DupCandidates.begin(), DupCandidates.end(), cmp);
+  }
+  for(auto &Tup : DupCandidates) {
+    BranchProbability DupProb;
+    MachineBasicBlock *Succ;
+    std::tie(DupProb, Succ) = Tup;
+    if (DupProb < BestProb)
+      break;
+    if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
+        && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
+      DEBUG(
+          dbgs() << "    Candidate: " << getBlockName(Succ) << ", probability: "
+                 << DupProb
+                 << " (Tail Duplicate)\n");
+      BestSucc.BB = Succ;
+      BestSucc.ShouldTailDup = true;
+      break;
+    }
+  }
+
+  if (BestSucc.BB)
+    DEBUG(dbgs() << "    Selected: " << getBlockName(BestSucc.BB) << "\n");
 
   return BestSucc;
 }
@@ -862,7 +1520,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
 ///
 /// \returns The best block found, or null if none are viable.
 MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
-    BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
+    const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
   // Once we need to walk the worklist looking for a candidate, cleanup the
   // worklist of already placed entries.
   // FIXME: If this shows up on profiles, it could be folded (at the cost of
@@ -948,7 +1606,7 @@ MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
 }
 
 void MachineBlockPlacement::fillWorkLists(
-    MachineBasicBlock *MBB,
+    const MachineBasicBlock *MBB,
     SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
     const BlockFilterSet *BlockFilter = nullptr) {
   BlockChain &Chain = *BlockToChain[MBB];
@@ -970,23 +1628,23 @@ void MachineBlockPlacement::fillWorkLists(
   if (Chain.UnscheduledPredecessors != 0)
     return;
 
-  MBB = *Chain.begin();
-  if (MBB->isEHPad())
-    EHPadWorkList.push_back(MBB);
+  MachineBasicBlock *BB = *Chain.begin();
+  if (BB->isEHPad())
+    EHPadWorkList.push_back(BB);
   else
-    BlockWorkList.push_back(MBB);
+    BlockWorkList.push_back(BB);
 }
 
 void MachineBlockPlacement::buildChain(
-    MachineBasicBlock *BB, BlockChain &Chain,
+    const MachineBasicBlock *HeadBB, BlockChain &Chain,
     BlockFilterSet *BlockFilter) {
-  assert(BB && "BB must not be null.\n");
-  assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match.\n");
+  assert(HeadBB && "BB must not be null.\n");
+  assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
   MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
 
-  MachineBasicBlock *LoopHeaderBB = BB;
+  const MachineBasicBlock *LoopHeaderBB = HeadBB;
   markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
-  BB = *std::prev(Chain.end());
+  MachineBasicBlock *BB = *std::prev(Chain.end());
   for (;;) {
     assert(BB && "null block found at end of chain in loop.");
     assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
@@ -995,7 +1653,11 @@ void MachineBlockPlacement::buildChain(
 
     // Look for the best viable successor if there is one to place immediately
     // after this block.
-    MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter);
+    auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
+    MachineBasicBlock* BestSucc = Result.BB;
+    bool ShouldTailDup = Result.ShouldTailDup;
+    if (TailDupPlacement)
+      ShouldTailDup |= (BestSucc && shouldTailDuplicate(BestSucc));
 
     // If an immediate successor isn't available, look for the best viable
     // block among those we've identified as not violating the loop's CFG at
@@ -1016,7 +1678,7 @@ void MachineBlockPlacement::buildChain(
 
     // Placement may have changed tail duplication opportunities.
     // Check for that now.
-    if (TailDupPlacement && BestSucc) {
+    if (TailDupPlacement && BestSucc && ShouldTailDup) {
       // If the chosen successor was duplicated into all its predecessors,
       // don't bother laying it out, just go round the loop again with BB as
       // the chain end.
@@ -1052,7 +1714,7 @@ void MachineBlockPlacement::buildChain(
 /// unconditional jump (for the backedge) rotating it in front of the loop
 /// header is always profitable.
 MachineBasicBlock *
-MachineBlockPlacement::findBestLoopTop(MachineLoop &L,
+MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
                                        const BlockFilterSet &LoopBlockSet) {
   // Placing the latch block before the header may introduce an extra branch
   // that skips this block the first time the loop is executed, which we want
@@ -1116,7 +1778,7 @@ MachineBlockPlacement::findBestLoopTop(MachineLoop &L,
 /// block to layout at the top of the loop. Typically this is done to maximize
 /// fallthrough opportunities.
 MachineBasicBlock *
-MachineBlockPlacement::findBestLoopExit(MachineLoop &L,
+MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
                                         const BlockFilterSet &LoopBlockSet) {
   // We don't want to layout the loop linearly in all cases. If the loop header
   // is just a normal basic block in the loop, we want to look for what block
@@ -1235,7 +1897,7 @@ MachineBlockPlacement::findBestLoopExit(MachineLoop &L,
 /// branches. For example, if the loop has fallthrough into its header and out
 /// of its bottom already, don't rotate it.
 void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
-                                       MachineBasicBlock *ExitingBB,
+                                       const MachineBasicBlock *ExitingBB,
                                        const BlockFilterSet &LoopBlockSet) {
   if (!ExitingBB)
     return;
@@ -1285,7 +1947,8 @@ void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
 ///  Therefore, the cost for a given rotation is the sum of costs listed above.
 ///  We select the best rotation with the smallest cost.
 void MachineBlockPlacement::rotateLoopWithProfile(
-    BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet) {
+    BlockChain &LoopChain, const MachineLoop &L,
+    const BlockFilterSet &LoopBlockSet) {
   auto HeaderBB = L.getHeader();
   auto HeaderIter = find(LoopChain, HeaderBB);
   auto RotationPos = LoopChain.end();
@@ -1422,7 +2085,7 @@ void MachineBlockPlacement::rotateLoopWithProfile(
 /// When profile data is available, exclude cold blocks from the returned set;
 /// otherwise, collect all blocks in the loop.
 MachineBlockPlacement::BlockFilterSet
-MachineBlockPlacement::collectLoopBlockSet(MachineLoop &L) {
+MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
   BlockFilterSet LoopBlockSet;
 
   // Filter cold blocks off from LoopBlockSet when profile data is available.
@@ -1459,10 +2122,10 @@ MachineBlockPlacement::collectLoopBlockSet(MachineLoop &L) {
 /// as much as possible. We can then stitch the chains together in a way which
 /// both preserves the topological structure and minimizes taken conditional
 /// branches.
-void MachineBlockPlacement::buildLoopChains(MachineLoop &L) {
+void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
   // First recurse through any nested loops, building chains for those inner
   // loops.
-  for (MachineLoop *InnerLoop : L)
+  for (const MachineLoop *InnerLoop : L)
     buildLoopChains(*InnerLoop);
 
   assert(BlockWorkList.empty());
@@ -1499,7 +2162,7 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) {
   assert(LoopChain.UnscheduledPredecessors == 0);
   UpdatedPreds.insert(&LoopChain);
 
-  for (MachineBasicBlock *LoopBB : LoopBlockSet)
+  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
     fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
 
   buildChain(LoopTop, LoopChain, &LoopBlockSet);
@@ -1533,7 +2196,7 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) {
 
     if (!LoopBlockSet.empty()) {
       BadLoop = true;
-      for (MachineBasicBlock *LoopBB : LoopBlockSet)
+      for (const MachineBasicBlock *LoopBB : LoopBlockSet)
         dbgs() << "Loop contains blocks never placed into a chain!\n"
                << "  Loop header:  " << getBlockName(*L.block_begin()) << "\n"
                << "  Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
@@ -1546,31 +2209,6 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) {
   EHPadWorkList.clear();
 }
 
-/// When OutlineOpitonalBranches is on, this method collects BBs that
-/// dominates all terminator blocks of the function \p F.
-void MachineBlockPlacement::collectMustExecuteBBs() {
-  if (OutlineOptionalBranches) {
-    // Find the nearest common dominator of all of F's terminators.
-    MachineBasicBlock *Terminator = nullptr;
-    for (MachineBasicBlock &MBB : *F) {
-      if (MBB.succ_size() == 0) {
-        if (Terminator == nullptr)
-          Terminator = &MBB;
-        else
-          Terminator = MDT->findNearestCommonDominator(Terminator, &MBB);
-      }
-    }
-
-    // MBBs dominating this common dominator are unavoidable.
-    UnavoidableBlocks.clear();
-    for (MachineBasicBlock &MBB : *F) {
-      if (MDT->dominates(&MBB, Terminator)) {
-        UnavoidableBlocks.insert(&MBB);
-      }
-    }
-  }
-}
-
 void MachineBlockPlacement::buildCFGChains() {
   // Ensure that every BB in the function has an associated chain to simplify
   // the assumptions of the remaining algorithm.
@@ -1605,9 +2243,6 @@ void MachineBlockPlacement::buildCFGChains() {
     }
   }
 
-  // Turned on with OutlineOptionalBranches option
-  collectMustExecuteBBs();
-
   // Build any loop-based chains.
   PreferredLoopExit = nullptr;
   for (MachineLoop *L : *MLI)
@@ -1839,7 +2474,7 @@ void MachineBlockPlacement::alignBlocks() {
 /// @return true if \p BB was removed.
 bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
     MachineBasicBlock *BB, MachineBasicBlock *&LPred,
-    MachineBasicBlock *LoopHeaderBB,
+    const MachineBasicBlock *LoopHeaderBB,
     BlockChain &Chain, BlockFilterSet *BlockFilter,
     MachineFunction::iterator &PrevUnplacedBlockIt) {
   bool Removed, DuplicatedToLPred;
@@ -1901,21 +2536,16 @@ bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
 /// \return  - True if the block was duplicated into all preds and removed.
 bool MachineBlockPlacement::maybeTailDuplicateBlock(
     MachineBasicBlock *BB, MachineBasicBlock *LPred,
-    const BlockChain &Chain, BlockFilterSet *BlockFilter,
+    BlockChain &Chain, BlockFilterSet *BlockFilter,
     MachineFunction::iterator &PrevUnplacedBlockIt,
     bool &DuplicatedToLPred) {
-
   DuplicatedToLPred = false;
+  if (!shouldTailDuplicate(BB))
+    return false;
+
   DEBUG(dbgs() << "Redoing tail duplication for Succ#"
         << BB->getNumber() << "\n");
-  bool IsSimple = TailDup.isSimpleBB(BB);
-  // Blocks with single successors don't create additional fallthrough
-  // opportunities. Don't duplicate them. TODO: When conditional exits are
-  // analyzable, allow them to be duplicated.
-  if (!IsSimple && BB->succ_size() == 1)
-    return false;
-  if (!TailDup.shouldTailDuplicate(IsSimple, *BB))
-    return false;
+
   // This has to be a callback because none of it can be done after
   // BB is deleted.
   bool Removed = false;
@@ -1967,6 +2597,7 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock(
       llvm::function_ref<void(MachineBasicBlock*)>(RemovalCallback);
 
   SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
+  bool IsSimple = TailDup.isSimpleBB(BB);
   TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred,
                                  &DuplicatedPreds, &RemovalCallbackRef);
 
@@ -2006,21 +2637,24 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   MLI = &getAnalysis<MachineLoopInfo>();
   TII = MF.getSubtarget().getInstrInfo();
   TLI = MF.getSubtarget().getTargetLowering();
-  MDT = &getAnalysis<MachineDominatorTree>();
+  MPDT = nullptr;
 
   // Initialize PreferredLoopExit to nullptr here since it may never be set if
   // there are no MachineLoops.
   PreferredLoopExit = nullptr;
 
+  assert(BlockToChain.empty());
+  assert(ComputedEdges.empty());
+
   if (TailDupPlacement) {
-    unsigned TailDupSize = TailDuplicatePlacementThreshold;
+    MPDT = &getAnalysis<MachinePostDominatorTree>();
+    unsigned TailDupSize = TailDupPlacementThreshold;
     if (MF.getFunction()->optForSize())
       TailDupSize = 1;
     TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize);
+    precomputeTriangleChains();
   }
 
-  assert(BlockToChain.empty());
-
   buildCFGChains();
 
   // Changing the layout can create new tail merging opportunities.
@@ -2032,7 +2666,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
                          BranchFoldPlacement;
   // No tail merging opportunities if the block number is less than four.
   if (MF.size() > 3 && EnableTailMerge) {
-    unsigned TailMergeSize = TailDuplicatePlacementThreshold + 1;
+    unsigned TailMergeSize = TailDupPlacementThreshold + 1;
     BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
                     *MBPI, TailMergeSize);
 
@@ -2041,8 +2675,10 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
                             /*AfterBlockPlacement=*/true)) {
       // Redo the layout if tail merging creates/removes/moves blocks.
       BlockToChain.clear();
-      // Must redo the dominator tree if blocks were changed.
-      MDT->runOnMachineFunction(MF);
+      ComputedEdges.clear();
+      // Must redo the post-dominator tree if blocks were changed.
+      if (MPDT)
+        MPDT->runOnMachineFunction(MF);
       ChainAllocator.DestroyAll();
       buildCFGChains();
     }
@@ -2052,6 +2688,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   alignBlocks();
 
   BlockToChain.clear();
+  ComputedEdges.clear();
   ChainAllocator.DestroyAll();
 
   if (AlignAllBlock)
@@ -2067,6 +2704,12 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
         MBI->setAlignment(AlignAllNonFallThruBlocks);
     }
   }
+  if (ViewBlockLayoutWithBFI != GVDT_None &&
+      (ViewBlockFreqFuncName.empty() ||
+       F->getFunction()->getName().equals(ViewBlockFreqFuncName))) {
+    MBFI->view("MBP." + MF.getName(), false);
+  }
+
 
   // We always return true as we have no way to track whether the final order
   // differs from the original order.