13 files changed, 626 insertions, 140 deletions
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 6469c899feea..d6d6b1a7fa09 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -235,22 +235,26 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU,
   // These dominator edges will be redirected from Pred.
   std::vector<DominatorTree::UpdateType> Updates;
   if (DTU) {
-    SmallPtrSet<BasicBlock *, 2> SuccsOfBB(succ_begin(BB), succ_end(BB));
+    // To avoid processing the same predecessor more than once.
+    SmallPtrSet<BasicBlock *, 8> SeenSuccs;
     SmallPtrSet<BasicBlock *, 2> SuccsOfPredBB(succ_begin(PredBB),
                                                succ_end(PredBB));
-    Updates.reserve(Updates.size() + 2 * SuccsOfBB.size() + 1);
+    Updates.reserve(Updates.size() + 2 * succ_size(BB) + 1);
     // Add insert edges first. Experimentally, for the particular case of two
     // blocks that can be merged, with a single successor and single predecessor
     // respectively, it is beneficial to have all insert updates first. Deleting
     // edges first may lead to unreachable blocks, followed by inserting edges
     // making the blocks reachable again. Such DT updates lead to high compile
     // times. We add inserts before deletes here to reduce compile time.
-    for (BasicBlock *SuccOfBB : SuccsOfBB)
+    for (BasicBlock *SuccOfBB : successors(BB))
       // This successor of BB may already be a PredBB's successor.
       if (!SuccsOfPredBB.contains(SuccOfBB))
-        Updates.push_back({DominatorTree::Insert, PredBB, SuccOfBB});
-    for (BasicBlock *SuccOfBB : SuccsOfBB)
-      Updates.push_back({DominatorTree::Delete, BB, SuccOfBB});
+        if (SeenSuccs.insert(SuccOfBB).second)
+          Updates.push_back({DominatorTree::Insert, PredBB, SuccOfBB});
+    SeenSuccs.clear();
+    for (BasicBlock *SuccOfBB : successors(BB))
+      if (SeenSuccs.insert(SuccOfBB).second)
+        Updates.push_back({DominatorTree::Delete, BB, SuccOfBB});
     Updates.push_back({DominatorTree::Delete, PredBB, BB});
   }
 
@@ -804,14 +808,14 @@ static BasicBlock *SplitBlockImpl(BasicBlock *Old, Instruction *SplitPt,
   if (DTU) {
     SmallVector<DominatorTree::UpdateType, 8> Updates;
     // Old dominates New. New node dominates all other nodes dominated by Old.
-    SmallPtrSet<BasicBlock *, 8> UniqueSuccessorsOfOld(succ_begin(New),
-                                                       succ_end(New));
+    SmallPtrSet<BasicBlock *, 8> UniqueSuccessorsOfOld;
     Updates.push_back({DominatorTree::Insert, Old, New});
-    Updates.reserve(Updates.size() + 2 * UniqueSuccessorsOfOld.size());
-    for (BasicBlock *UniqueSuccessorOfOld : UniqueSuccessorsOfOld) {
-      Updates.push_back({DominatorTree::Insert, New, UniqueSuccessorOfOld});
-      Updates.push_back({DominatorTree::Delete, Old, UniqueSuccessorOfOld});
-    }
+    Updates.reserve(Updates.size() + 2 * succ_size(New));
+    for (BasicBlock *SuccessorOfOld : successors(New))
+      if (UniqueSuccessorsOfOld.insert(SuccessorOfOld).second) {
+        Updates.push_back({DominatorTree::Insert, New, SuccessorOfOld});
+        Updates.push_back({DominatorTree::Delete, Old, SuccessorOfOld});
+      }
 
     DTU->applyUpdates(Updates);
   } else if (DT)
@@ -870,14 +874,14 @@ BasicBlock *llvm::splitBlockBefore(BasicBlock *Old, Instruction *SplitPt,
     SmallVector<DominatorTree::UpdateType, 8> DTUpdates;
     // New dominates Old. The predecessor nodes of the Old node dominate
     // New node.
-    SmallPtrSet<BasicBlock *, 8> UniquePredecessorsOfOld(pred_begin(New),
-                                                         pred_end(New));
+    SmallPtrSet<BasicBlock *, 8> UniquePredecessorsOfOld;
     DTUpdates.push_back({DominatorTree::Insert, New, Old});
-    DTUpdates.reserve(DTUpdates.size() + 2 * UniquePredecessorsOfOld.size());
-    for (BasicBlock *UniquePredecessorOfOld : UniquePredecessorsOfOld) {
-      DTUpdates.push_back({DominatorTree::Insert, UniquePredecessorOfOld, New});
-      DTUpdates.push_back({DominatorTree::Delete, UniquePredecessorOfOld, Old});
-    }
+    DTUpdates.reserve(DTUpdates.size() + 2 * pred_size(New));
+    for (BasicBlock *PredecessorOfOld : predecessors(New))
+      if (UniquePredecessorsOfOld.insert(PredecessorOfOld).second) {
+        DTUpdates.push_back({DominatorTree::Insert, PredecessorOfOld, New});
+        DTUpdates.push_back({DominatorTree::Delete, PredecessorOfOld, Old});
+      }
 
     DTU->applyUpdates(DTUpdates);
 
@@ -910,13 +914,14 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
     } else {
       // Split block expects NewBB to have a non-empty set of predecessors.
       SmallVector<DominatorTree::UpdateType, 8> Updates;
-      SmallPtrSet<BasicBlock *, 8> UniquePreds(Preds.begin(), Preds.end());
+      SmallPtrSet<BasicBlock *, 8> UniquePreds;
       Updates.push_back({DominatorTree::Insert, NewBB, OldBB});
-      Updates.reserve(Updates.size() + 2 * UniquePreds.size());
-      for (auto *UniquePred : UniquePreds) {
-        Updates.push_back({DominatorTree::Insert, UniquePred, NewBB});
-        Updates.push_back({DominatorTree::Delete, UniquePred, OldBB});
-      }
+      Updates.reserve(Updates.size() + 2 * Preds.size());
+      for (auto *Pred : Preds)
+        if (UniquePreds.insert(Pred).second) {
+          Updates.push_back({DominatorTree::Insert, Pred, NewBB});
+          Updates.push_back({DominatorTree::Delete, Pred, OldBB});
+        }
       DTU->applyUpdates(Updates);
     }
   } else if (DT) {
@@ -1376,14 +1381,14 @@ SplitBlockAndInsertIfThenImpl(Value *Cond, Instruction *SplitBefore,
   BasicBlock *Head = SplitBefore->getParent();
   BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator());
   if (DTU) {
-    SmallPtrSet<BasicBlock *, 8> UniqueSuccessorsOfHead(succ_begin(Tail),
-                                                        succ_end(Tail));
+    SmallPtrSet<BasicBlock *, 8> UniqueSuccessorsOfHead;
     Updates.push_back({DominatorTree::Insert, Head, Tail});
-    Updates.reserve(Updates.size() + 2 * UniqueSuccessorsOfHead.size());
-    for (BasicBlock *UniqueSuccessorOfHead : UniqueSuccessorsOfHead) {
-      Updates.push_back({DominatorTree::Insert, Tail, UniqueSuccessorOfHead});
-      Updates.push_back({DominatorTree::Delete, Head, UniqueSuccessorOfHead});
-    }
+    Updates.reserve(Updates.size() + 2 * succ_size(Tail));
+    for (BasicBlock *SuccessorOfHead : successors(Tail))
+      if (UniqueSuccessorsOfHead.insert(SuccessorOfHead).second) {
+        Updates.push_back({DominatorTree::Insert, Tail, SuccessorOfHead});
+        Updates.push_back({DominatorTree::Delete, Head, SuccessorOfHead});
+      }
   }
   Instruction *HeadOldTerm = Head->getTerminator();
   LLVMContext &C = Head->getContext();
diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index 957935398972..580cfd80141e 100644
--- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -452,18 +452,17 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
     return Changed;
   case LibFunc_mempcpy:
   case LibFunc_memccpy:
+    Changed |= setWillReturn(F);
+    LLVM_FALLTHROUGH;
+  case LibFunc_memcpy_chk:
     Changed |= setDoesNotThrow(F);
     Changed |= setOnlyAccessesArgMemory(F);
-    Changed |= setWillReturn(F);
     Changed |= setDoesNotAlias(F, 0);
     Changed |= setOnlyWritesMemory(F, 0);
     Changed |= setDoesNotAlias(F, 1);
     Changed |= setDoesNotCapture(F, 1);
     Changed |= setOnlyReadsMemory(F, 1);
     return Changed;
-  case LibFunc_memcpy_chk:
-    Changed |= setDoesNotThrow(F);
-    return Changed;
   case LibFunc_memalign:
     Changed |= setOnlyAccessesInaccessibleMemory(F);
     Changed |= setRetNoUndef(F);
@@ -1018,9 +1017,8 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
     Changed |= setDoesNotCapture(F, 0);
     Changed |= setDoesNotCapture(F, 1);
     return Changed;
-  // TODO: add LibFunc entries for:
-  // case LibFunc_memset_pattern4:
-  // case LibFunc_memset_pattern8:
+  case LibFunc_memset_pattern4:
+  case LibFunc_memset_pattern8:
   case LibFunc_memset_pattern16:
     Changed |= setOnlyAccessesArgMemory(F);
     Changed |= setDoesNotCapture(F, 0);
@@ -1029,10 +1027,12 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
     Changed |= setOnlyReadsMemory(F, 1);
     return Changed;
   case LibFunc_memset:
-    Changed |= setOnlyAccessesArgMemory(F);
     Changed |= setWillReturn(F);
-    Changed |= setDoesNotThrow(F);
+    LLVM_FALLTHROUGH;
+  case LibFunc_memset_chk:
+    Changed |= setOnlyAccessesArgMemory(F);
     Changed |= setOnlyWritesMemory(F, 0);
+    Changed |= setDoesNotThrow(F);
     return Changed;
   // int __nvvm_reflect(const char *)
   case LibFunc_nvvm_reflect:
diff --git a/llvm/lib/Transforms/Utils/CloneModule.cpp b/llvm/lib/Transforms/Utils/CloneModule.cpp
index 200deca4b317..57c273a0e3c5 100644
--- a/llvm/lib/Transforms/Utils/CloneModule.cpp
+++ b/llvm/lib/Transforms/Utils/CloneModule.cpp
@@ -135,10 +135,18 @@ std::unique_ptr<Module> llvm::CloneModule(
   // Similarly, copy over function bodies now...
   //
   for (const Function &I : M) {
-    if (I.isDeclaration())
+    Function *F = cast<Function>(VMap[&I]);
+
+    if (I.isDeclaration()) {
+      // Copy over metadata for declarations since we're not doing it below in
+      // CloneFunctionInto().
+      SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+      I.getAllMetadata(MDs);
+      for (auto MD : MDs)
+        F->addMetadata(MD.first, *MapMetadata(MD.second, VMap));
       continue;
+    }
 
-    Function *F = cast<Function>(VMap[&I]);
     if (!ShouldCloneDefinition(&I)) {
       // Skip after setting the correct linkage for an external reference.
       F->setLinkage(GlobalValue::ExternalLinkage);
diff --git a/llvm/lib/Transforms/Utils/GuardUtils.cpp b/llvm/lib/Transforms/Utils/GuardUtils.cpp
index 4dbcbf80d3da..7c310f16d46e 100644
--- a/llvm/lib/Transforms/Utils/GuardUtils.cpp
+++ b/llvm/lib/Transforms/Utils/GuardUtils.cpp
@@ -74,7 +74,7 @@ void llvm::makeGuardControlFlowExplicit(Function *DeoptIntrinsic,
                                  {}, {}, nullptr, "widenable_cond");
     CheckBI->setCondition(B.CreateAnd(CheckBI->getCondition(), WC,
                                       "exiplicit_guard_cond"));
-    assert(isWidenableBranch(CheckBI) && "sanity check");
+    assert(isWidenableBranch(CheckBI) && "Branch must be widenable.");
   }
 }
 
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index f4776589910f..997667810580 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -1218,10 +1218,9 @@ static void AddReturnAttributes(CallBase &CB, ValueToValueMapTy &VMap) {
     if (!RI || !isa<CallBase>(RI->getOperand(0)))
       continue;
     auto *RetVal = cast<CallBase>(RI->getOperand(0));
-    // Sanity check that the cloned RetVal exists and is a call, otherwise we
-    // cannot add the attributes on the cloned RetVal.
-    // Simplification during inlining could have transformed the cloned
-    // instruction.
+    // Check that the cloned RetVal exists and is a call, otherwise we cannot
+    // add the attributes on the cloned RetVal. Simplification during inlining
+    // could have transformed the cloned instruction.
     auto *NewRetVal = dyn_cast_or_null<CallBase>(VMap.lookup(RetVal));
     if (!NewRetVal)
       continue;
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 74ab37fadf36..ec926b1f5a94 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -529,8 +529,8 @@ bool llvm::RecursivelyDeleteTriviallyDeadInstructionsPermissive(
     std::function<void(Value *)> AboutToDeleteCallback) {
   unsigned S = 0, E = DeadInsts.size(), Alive = 0;
   for (; S != E; ++S) {
-    auto *I = cast<Instruction>(DeadInsts[S]);
-    if (!isInstructionTriviallyDead(I)) {
+    auto *I = dyn_cast<Instruction>(DeadInsts[S]);
+    if (!I || !isInstructionTriviallyDead(I)) {
       DeadInsts[S] = nullptr;
       ++Alive;
     }
@@ -760,15 +760,18 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB,
   SmallVector<DominatorTree::UpdateType, 32> Updates;
 
   if (DTU) {
-    SmallPtrSet<BasicBlock *, 2> PredsOfPredBB(pred_begin(PredBB),
-                                               pred_end(PredBB));
-    Updates.reserve(Updates.size() + 2 * PredsOfPredBB.size() + 1);
-    for (BasicBlock *PredOfPredBB : PredsOfPredBB)
+    // To avoid processing the same predecessor more than once.
+    SmallPtrSet<BasicBlock *, 2> SeenPreds;
+    Updates.reserve(Updates.size() + 2 * pred_size(PredBB) + 1);
+    for (BasicBlock *PredOfPredBB : predecessors(PredBB))
       // This predecessor of PredBB may already have DestBB as a successor.
       if (PredOfPredBB != PredBB)
-        Updates.push_back({DominatorTree::Insert, PredOfPredBB, DestBB});
-    for (BasicBlock *PredOfPredBB : PredsOfPredBB)
-      Updates.push_back({DominatorTree::Delete, PredOfPredBB, PredBB});
+        if (SeenPreds.insert(PredOfPredBB).second)
+          Updates.push_back({DominatorTree::Insert, PredOfPredBB, DestBB});
+    SeenPreds.clear();
+    for (BasicBlock *PredOfPredBB : predecessors(PredBB))
+      if (SeenPreds.insert(PredOfPredBB).second)
+        Updates.push_back({DominatorTree::Delete, PredOfPredBB, PredBB});
     Updates.push_back({DominatorTree::Delete, PredBB, DestBB});
   }
 
@@ -1096,16 +1099,20 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB,
 
   SmallVector<DominatorTree::UpdateType, 32> Updates;
   if (DTU) {
+    // To avoid processing the same predecessor more than once.
+    SmallPtrSet<BasicBlock *, 8> SeenPreds;
     // All predecessors of BB will be moved to Succ.
-    SmallPtrSet<BasicBlock *, 8> PredsOfBB(pred_begin(BB), pred_end(BB));
     SmallPtrSet<BasicBlock *, 8> PredsOfSucc(pred_begin(Succ), pred_end(Succ));
-    Updates.reserve(Updates.size() + 2 * PredsOfBB.size() + 1);
-    for (auto *PredOfBB : PredsOfBB)
+    Updates.reserve(Updates.size() + 2 * pred_size(BB) + 1);
+    for (auto *PredOfBB : predecessors(BB))
       // This predecessor of BB may already have Succ as a successor.
       if (!PredsOfSucc.contains(PredOfBB))
-        Updates.push_back({DominatorTree::Insert, PredOfBB, Succ});
-    for (auto *PredOfBB : PredsOfBB)
-      Updates.push_back({DominatorTree::Delete, PredOfBB, BB});
+        if (SeenPreds.insert(PredOfBB).second)
+          Updates.push_back({DominatorTree::Insert, PredOfBB, Succ});
+    SeenPreds.clear();
+    for (auto *PredOfBB : predecessors(BB))
+      if (SeenPreds.insert(PredOfBB).second)
+        Updates.push_back({DominatorTree::Delete, PredOfBB, BB});
     Updates.push_back({DominatorTree::Delete, BB, Succ});
   }
 
@@ -2190,26 +2197,6 @@ void llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) {
     DTU->applyUpdates({{DominatorTree::Delete, BB, UnwindDestBB}});
 }
 
-void llvm::createUnreachableSwitchDefault(SwitchInst *Switch,
-                                          DomTreeUpdater *DTU) {
-  LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n");
-  auto *BB = Switch->getParent();
-  auto *OrigDefaultBlock = Switch->getDefaultDest();
-  OrigDefaultBlock->removePredecessor(BB);
-  BasicBlock *NewDefaultBlock = BasicBlock::Create(
-      BB->getContext(), BB->getName() + ".unreachabledefault", BB->getParent(),
-      OrigDefaultBlock);
-  new UnreachableInst(Switch->getContext(), NewDefaultBlock);
-  Switch->setDefaultDest(&*NewDefaultBlock);
-  if (DTU) {
-    SmallVector<DominatorTree::UpdateType, 2> Updates;
-    Updates.push_back({DominatorTree::Insert, BB, &*NewDefaultBlock});
-    if (!is_contained(successors(BB), OrigDefaultBlock))
-      Updates.push_back({DominatorTree::Delete, BB, &*OrigDefaultBlock});
-    DTU->applyUpdates(Updates);
-  }
-}
-
 BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI,
                                                    BasicBlock *UnwindEdge,
                                                    DomTreeUpdater *DTU) {
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index a92cb6a313d3..bb719a499a4c 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -623,15 +623,13 @@ bool llvm::UnrollRuntimeLoopRemainder(
   if (!SE)
     return false;
 
-  // Only unroll loops with a computable trip count, and the trip count needs
-  // to be an int value (allowing a pointer type is a TODO item).
+  // Only unroll loops with a computable trip count.
   // We calculate the backedge count by using getExitCount on the Latch block,
   // which is proven to be the only exiting block in this loop. This is same as
   // calculating getBackedgeTakenCount on the loop (which computes SCEV for all
   // exiting blocks).
   const SCEV *BECountSC = SE->getExitCount(L, Latch);
-  if (isa<SCEVCouldNotCompute>(BECountSC) ||
-      !BECountSC->getType()->isIntegerTy()) {
+  if (isa<SCEVCouldNotCompute>(BECountSC)) {
     LLVM_DEBUG(dbgs() << "Could not compute exit block SCEV\n");
     return false;
   }
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 68572d479742..c8e42acdffb3 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1049,6 +1049,7 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder,
     return Builder.CreateOrReduce(Src);
   case RecurKind::Xor:
     return Builder.CreateXorReduce(Src);
+  case RecurKind::FMulAdd:
   case RecurKind::FAdd:
     return Builder.CreateFAddReduce(ConstantFP::getNegativeZero(SrcVecEltTy),
                                     Src);
@@ -1091,7 +1092,8 @@ Value *llvm::createTargetReduction(IRBuilderBase &B,
 Value *llvm::createOrderedReduction(IRBuilderBase &B,
                                     const RecurrenceDescriptor &Desc,
                                     Value *Src, Value *Start) {
-  assert(Desc.getRecurrenceKind() == RecurKind::FAdd &&
+  assert((Desc.getRecurrenceKind() == RecurKind::FAdd ||
+          Desc.getRecurrenceKind() == RecurKind::FMulAdd) &&
          "Unexpected reduction kind");
   assert(Src->getType()->isVectorTy() && "Expected a vector type");
   assert(!Start->getType()->isVectorTy() && "Expected a scalar type");
diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp
index 5893ce15b129..7d9992176658 100644
--- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp
@@ -446,6 +446,9 @@ void LoadAndStorePromoter::run(const SmallVectorImpl<Instruction *> &Insts) {
   // Now that everything is rewritten, delete the old instructions from the
   // function.  They should all be dead now.
   for (Instruction *User : Insts) {
+    if (!shouldDelete(User))
+      continue;
+
     // If this is a load that still has uses, then the load must have been added
     // as a live value in the SSAUpdate data structure for a block (e.g. because
     // the loaded value was stored later).  In this case, we need to recursively
diff --git a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
new file mode 100644
index 000000000000..9495e442e0bf
--- /dev/null
+++ b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
@@ -0,0 +1,462 @@
+//===- SampleProfileInference.cpp - Adjust sample profiles in the IR ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a profile inference algorithm. Given an incomplete and
+// possibly imprecise block counts, the algorithm reconstructs realistic block
+// and edge counts that satisfy flow conservation rules, while minimally modify
+// input block counts.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SampleProfileInference.h"
+#include "llvm/Support/Debug.h"
+#include <queue>
+#include <set>
+
+using namespace llvm;
+#define DEBUG_TYPE "sample-profile-inference"
+
+namespace {
+
+/// A value indicating an infinite flow/capacity/weight of a block/edge.
+/// Not using numeric_limits<int64_t>::max(), as the values can be summed up
+/// during the execution.
+static constexpr int64_t INF = ((int64_t)1) << 50;
+
+/// The minimum-cost maximum flow algorithm.
+///
+/// The algorithm finds the maximum flow of minimum cost on a given (directed)
+/// network using a modified version of the classical Moore-Bellman-Ford
+/// approach. The algorithm applies a number of augmentation iterations in which
+/// flow is sent along paths of positive capacity from the source to the sink.
+/// The worst-case time complexity of the implementation is O(v(f)*m*n), where
+/// where m is the number of edges, n is the number of vertices, and v(f) is the
+/// value of the maximum flow. However, the observed running time on typical
+/// instances is sub-quadratic, that is, o(n^2).
+///
+/// The input is a set of edges with specified costs and capacities, and a pair
+/// of nodes (source and sink). The output is the flow along each edge of the
+/// minimum total cost respecting the given edge capacities.
+class MinCostMaxFlow {
+public:
+  // Initialize algorithm's data structures for a network of a given size.
+  void initialize(uint64_t NodeCount, uint64_t SourceNode, uint64_t SinkNode) {
+    Source = SourceNode;
+    Target = SinkNode;
+
+    Nodes = std::vector<Node>(NodeCount);
+    Edges = std::vector<std::vector<Edge>>(NodeCount, std::vector<Edge>());
+  }
+
+  // Run the algorithm.
+  int64_t run() {
+    // Find an augmenting path and update the flow along the path
+    size_t AugmentationIters = 0;
+    while (findAugmentingPath()) {
+      augmentFlowAlongPath();
+      AugmentationIters++;
+    }
+
+    // Compute the total flow and its cost
+    int64_t TotalCost = 0;
+    int64_t TotalFlow = 0;
+    for (uint64_t Src = 0; Src < Nodes.size(); Src++) {
+      for (auto &Edge : Edges[Src]) {
+        if (Edge.Flow > 0) {
+          TotalCost += Edge.Cost * Edge.Flow;
+          if (Src == Source)
+            TotalFlow += Edge.Flow;
+        }
+      }
+    }
+    LLVM_DEBUG(dbgs() << "Completed profi after " << AugmentationIters
+                      << " iterations with " << TotalFlow << " total flow"
+                      << " of " << TotalCost << " cost\n");
+    (void)TotalFlow;
+    return TotalCost;
+  }
+
+  /// Adding an edge to the network with a specified capacity and a cost.
+  /// Multiple edges between a pair of nodes are allowed but self-edges
+  /// are not supported.
+  void addEdge(uint64_t Src, uint64_t Dst, int64_t Capacity, int64_t Cost) {
+    assert(Capacity > 0 && "adding an edge of zero capacity");
+    assert(Src != Dst && "loop edge are not supported");
+
+    Edge SrcEdge;
+    SrcEdge.Dst = Dst;
+    SrcEdge.Cost = Cost;
+    SrcEdge.Capacity = Capacity;
+    SrcEdge.Flow = 0;
+    SrcEdge.RevEdgeIndex = Edges[Dst].size();
+
+    Edge DstEdge;
+    DstEdge.Dst = Src;
+    DstEdge.Cost = -Cost;
+    DstEdge.Capacity = 0;
+    DstEdge.Flow = 0;
+    DstEdge.RevEdgeIndex = Edges[Src].size();
+
+    Edges[Src].push_back(SrcEdge);
+    Edges[Dst].push_back(DstEdge);
+  }
+
+  /// Adding an edge to the network of infinite capacity and a given cost.
+  void addEdge(uint64_t Src, uint64_t Dst, int64_t Cost) {
+    addEdge(Src, Dst, INF, Cost);
+  }
+
+  /// Get the total flow from a given source node.
+  /// Returns a list of pairs (target node, amount of flow to the target).
+  const std::vector<std::pair<uint64_t, int64_t>> getFlow(uint64_t Src) const {
+    std::vector<std::pair<uint64_t, int64_t>> Flow;
+    for (auto &Edge : Edges[Src]) {
+      if (Edge.Flow > 0)
+        Flow.push_back(std::make_pair(Edge.Dst, Edge.Flow));
+    }
+    return Flow;
+  }
+
+  /// Get the total flow between a pair of nodes.
+  int64_t getFlow(uint64_t Src, uint64_t Dst) const {
+    int64_t Flow = 0;
+    for (auto &Edge : Edges[Src]) {
+      if (Edge.Dst == Dst) {
+        Flow += Edge.Flow;
+      }
+    }
+    return Flow;
+  }
+
+  /// A cost of increasing a block's count by one.
+  static constexpr int64_t AuxCostInc = 10;
+  /// A cost of decreasing a block's count by one.
+  static constexpr int64_t AuxCostDec = 20;
+  /// A cost of increasing a count of zero-weight block by one.
+  static constexpr int64_t AuxCostIncZero = 11;
+  /// A cost of increasing the entry block's count by one.
+  static constexpr int64_t AuxCostIncEntry = 40;
+  /// A cost of decreasing the entry block's count by one.
+  static constexpr int64_t AuxCostDecEntry = 10;
+  /// A cost of taking an unlikely jump.
+  static constexpr int64_t AuxCostUnlikely = ((int64_t)1) << 20;
+
+private:
+  /// Check for existence of an augmenting path with a positive capacity.
+  bool findAugmentingPath() {
+    // Initialize data structures
+    for (auto &Node : Nodes) {
+      Node.Distance = INF;
+      Node.ParentNode = uint64_t(-1);
+      Node.ParentEdgeIndex = uint64_t(-1);
+      Node.Taken = false;
+    }
+
+    std::queue<uint64_t> Queue;
+    Queue.push(Source);
+    Nodes[Source].Distance = 0;
+    Nodes[Source].Taken = true;
+    while (!Queue.empty()) {
+      uint64_t Src = Queue.front();
+      Queue.pop();
+      Nodes[Src].Taken = false;
+      // Although the residual network contains edges with negative costs
+      // (in particular, backward edges), it can be shown that there are no
+      // negative-weight cycles and the following two invariants are maintained:
+      // (i) Dist[Source, V] >= 0 and (ii) Dist[V, Target] >= 0 for all nodes V,
+      // where Dist is the length of the shortest path between two nodes. This
+      // allows to prune the search-space of the path-finding algorithm using
+      // the following early-stop criteria:
+      // -- If we find a path with zero-distance from Source to Target, stop the
+      //    search, as the path is the shortest since Dist[Source, Target] >= 0;
+      // -- If we have Dist[Source, V] > Dist[Source, Target], then do not
+      //    process node V, as it is guaranteed _not_ to be on a shortest path
+      //    from Source to Target; it follows from inequalities
+      //    Dist[Source, Target] >= Dist[Source, V] + Dist[V, Target]
+      //                         >= Dist[Source, V]
+      if (Nodes[Target].Distance == 0)
+        break;
+      if (Nodes[Src].Distance > Nodes[Target].Distance)
+        continue;
+
+      // Process adjacent edges
+      for (uint64_t EdgeIdx = 0; EdgeIdx < Edges[Src].size(); EdgeIdx++) {
+        auto &Edge = Edges[Src][EdgeIdx];
+        if (Edge.Flow < Edge.Capacity) {
+          uint64_t Dst = Edge.Dst;
+          int64_t NewDistance = Nodes[Src].Distance + Edge.Cost;
+          if (Nodes[Dst].Distance > NewDistance) {
+            // Update the distance and the parent node/edge
+            Nodes[Dst].Distance = NewDistance;
+            Nodes[Dst].ParentNode = Src;
+            Nodes[Dst].ParentEdgeIndex = EdgeIdx;
+            // Add the node to the queue, if it is not there yet
+            if (!Nodes[Dst].Taken) {
+              Queue.push(Dst);
+              Nodes[Dst].Taken = true;
+            }
+          }
+        }
+      }
+    }
+
+    return Nodes[Target].Distance != INF;
+  }
+
+  /// Update the current flow along the augmenting path.
+  void augmentFlowAlongPath() {
+    // Find path capacity
+    int64_t PathCapacity = INF;
+    uint64_t Now = Target;
+    while (Now != Source) {
+      uint64_t Pred = Nodes[Now].ParentNode;
+      auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];
+      PathCapacity = std::min(PathCapacity, Edge.Capacity - Edge.Flow);
+      Now = Pred;
+    }
+
+    assert(PathCapacity > 0 && "found incorrect augmenting path");
+
+    // Update the flow along the path
+    Now = Target;
+    while (Now != Source) {
+      uint64_t Pred = Nodes[Now].ParentNode;
+      auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];
+      auto &RevEdge = Edges[Now][Edge.RevEdgeIndex];
+
+      Edge.Flow += PathCapacity;
+      RevEdge.Flow -= PathCapacity;
+
+      Now = Pred;
+    }
+  }
+
+  /// An node in a flow network.
+  struct Node {
+    /// The cost of the cheapest path from the source to the current node.
+    int64_t Distance;
+    /// The node preceding the current one in the path.
+    uint64_t ParentNode;
+    /// The index of the edge between ParentNode and the current node.
+    uint64_t ParentEdgeIndex;
+    /// An indicator of whether the current node is in a queue.
+    bool Taken;
+  };
+  /// An edge in a flow network.
+  struct Edge {
+    /// The cost of the edge.
+    int64_t Cost;
+    /// The capacity of the edge.
+    int64_t Capacity;
+    /// The current flow on the edge.
+    int64_t Flow;
+    /// The destination node of the edge.
+    uint64_t Dst;
+    /// The index of the reverse edge between Dst and the current node.
+    uint64_t RevEdgeIndex;
+  };
+
+  /// The set of network nodes.
+  std::vector<Node> Nodes;
+  /// The set of network edges.
+  std::vector<std::vector<Edge>> Edges;
+  /// Source node of the flow.
+  uint64_t Source;
+  /// Target (sink) node of the flow.
+  uint64_t Target;
+};
+
+/// Initializing flow network for a given function.
+///
+/// Every block is split into three nodes that are responsible for (i) an
+/// incoming flow, (ii) an outgoing flow, and (iii) penalizing an increase or
+/// reduction of the block weight.
+void initializeNetwork(MinCostMaxFlow &Network, FlowFunction &Func) {
+  uint64_t NumBlocks = Func.Blocks.size();
+  assert(NumBlocks > 1 && "Too few blocks in a function");
+  LLVM_DEBUG(dbgs() << "Initializing profi for " << NumBlocks << " blocks\n");
+
+  // Pre-process data: make sure the entry weight is at least 1
+  if (Func.Blocks[Func.Entry].Weight == 0) {
+    Func.Blocks[Func.Entry].Weight = 1;
+  }
+  // Introducing dummy source/sink pairs to allow flow circulation.
+  // The nodes corresponding to blocks of Func have indicies in the range
+  // [0..3 * NumBlocks); the dummy nodes are indexed by the next four values.
+  uint64_t S = 3 * NumBlocks;
+  uint64_t T = S + 1;
+  uint64_t S1 = S + 2;
+  uint64_t T1 = S + 3;
+
+  Network.initialize(3 * NumBlocks + 4, S1, T1);
+
+  // Create three nodes for every block of the function
+  for (uint64_t B = 0; B < NumBlocks; B++) {
+    auto &Block = Func.Blocks[B];
+    assert((!Block.UnknownWeight || Block.Weight == 0 || Block.isEntry()) &&
+           "non-zero weight of a block w/o weight except for an entry");
+
+    // Split every block into two nodes
+    uint64_t Bin = 3 * B;
+    uint64_t Bout = 3 * B + 1;
+    uint64_t Baux = 3 * B + 2;
+    if (Block.Weight > 0) {
+      Network.addEdge(S1, Bout, Block.Weight, 0);
+      Network.addEdge(Bin, T1, Block.Weight, 0);
+    }
+
+    // Edges from S and to T
+    assert((!Block.isEntry() || !Block.isExit()) &&
+           "a block cannot be an entry and an exit");
+    if (Block.isEntry()) {
+      Network.addEdge(S, Bin, 0);
+    } else if (Block.isExit()) {
+      Network.addEdge(Bout, T, 0);
+    }
+
+    // An auxiliary node to allow increase/reduction of block counts:
+    // We assume that decreasing block counts is more expensive than increasing,
+    // and thus, setting separate costs here. In the future we may want to tune
+    // the relative costs so as to maximize the quality of generated profiles.
+    int64_t AuxCostInc = MinCostMaxFlow::AuxCostInc;
+    int64_t AuxCostDec = MinCostMaxFlow::AuxCostDec;
+    if (Block.UnknownWeight) {
+      // Do not penalize changing weights of blocks w/o known profile count
+      AuxCostInc = 0;
+      AuxCostDec = 0;
+    } else {
+      // Increasing the count for "cold" blocks with zero initial count is more
+      // expensive than for "hot" ones
+      if (Block.Weight == 0) {
+        AuxCostInc = MinCostMaxFlow::AuxCostIncZero;
+      }
+      // Modifying the count of the entry block is expensive
+      if (Block.isEntry()) {
+        AuxCostInc = MinCostMaxFlow::AuxCostIncEntry;
+        AuxCostDec = MinCostMaxFlow::AuxCostDecEntry;
+      }
+    }
+    // For blocks with self-edges, do not penalize a reduction of the count,
+    // as all of the increase can be attributed to the self-edge
+    if (Block.HasSelfEdge) {
+      AuxCostDec = 0;
+    }
+
+    Network.addEdge(Bin, Baux, AuxCostInc);
+    Network.addEdge(Baux, Bout, AuxCostInc);
+    if (Block.Weight > 0) {
+      Network.addEdge(Bout, Baux, AuxCostDec);
+      Network.addEdge(Baux, Bin, AuxCostDec);
+    }
+  }
+
+  // Creating edges for every jump
+  for (auto &Jump : Func.Jumps) {
+    uint64_t Src = Jump.Source;
+    uint64_t Dst = Jump.Target;
+    if (Src != Dst) {
+      uint64_t SrcOut = 3 * Src + 1;
+      uint64_t DstIn = 3 * Dst;
+      uint64_t Cost = Jump.IsUnlikely ? MinCostMaxFlow::AuxCostUnlikely : 0;
+      Network.addEdge(SrcOut, DstIn, Cost);
+    }
+  }
+
+  // Make sure we have a valid flow circulation
+  Network.addEdge(T, S, 0);
+}
+
+/// Extract resulting block and edge counts from the flow network.
+void extractWeights(MinCostMaxFlow &Network, FlowFunction &Func) {
+  uint64_t NumBlocks = Func.Blocks.size();
+
+  // Extract resulting block counts
+  for (uint64_t Src = 0; Src < NumBlocks; Src++) {
+    auto &Block = Func.Blocks[Src];
+    uint64_t SrcOut = 3 * Src + 1;
+    int64_t Flow = 0;
+    for (auto &Adj : Network.getFlow(SrcOut)) {
+      uint64_t DstIn = Adj.first;
+      int64_t DstFlow = Adj.second;
+      bool IsAuxNode = (DstIn < 3 * NumBlocks && DstIn % 3 == 2);
+      if (!IsAuxNode || Block.HasSelfEdge) {
+        Flow += DstFlow;
+      }
+    }
+    Block.Flow = Flow;
+    assert(Flow >= 0 && "negative block flow");
+  }
+
+  // Extract resulting jump counts
+  for (auto &Jump : Func.Jumps) {
+    uint64_t Src = Jump.Source;
+    uint64_t Dst = Jump.Target;
+    int64_t Flow = 0;
+    if (Src != Dst) {
+      uint64_t SrcOut = 3 * Src + 1;
+      uint64_t DstIn = 3 * Dst;
+      Flow = Network.getFlow(SrcOut, DstIn);
+    } else {
+      uint64_t SrcOut = 3 * Src + 1;
+      uint64_t SrcAux = 3 * Src + 2;
+      int64_t AuxFlow = Network.getFlow(SrcOut, SrcAux);
+      if (AuxFlow > 0)
+        Flow = AuxFlow;
+    }
+    Jump.Flow = Flow;
+    assert(Flow >= 0 && "negative jump flow");
+  }
+}
+
+#ifndef NDEBUG
+/// Verify that the computed flow values satisfy flow conservation rules
+void verifyWeights(const FlowFunction &Func) {
+  const uint64_t NumBlocks = Func.Blocks.size();
+  auto InFlow = std::vector<uint64_t>(NumBlocks, 0);
+  auto OutFlow = std::vector<uint64_t>(NumBlocks, 0);
+  for (auto &Jump : Func.Jumps) {
+    InFlow[Jump.Target] += Jump.Flow;
+    OutFlow[Jump.Source] += Jump.Flow;
+  }
+
+  uint64_t TotalInFlow = 0;
+  uint64_t TotalOutFlow = 0;
+  for (uint64_t I = 0; I < NumBlocks; I++) {
+    auto &Block = Func.Blocks[I];
+    if (Block.isEntry()) {
+      TotalInFlow += Block.Flow;
+      assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow");
+    } else if (Block.isExit()) {
+      TotalOutFlow += Block.Flow;
+      assert(Block.Flow == InFlow[I] && "incorrectly computed control flow");
+    } else {
+      assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow");
+      assert(Block.Flow == InFlow[I] && "incorrectly computed control flow");
+    }
+  }
+  assert(TotalInFlow == TotalOutFlow && "incorrectly computed control flow");
+}
+#endif
+
+} // end of anonymous namespace
+
+/// Apply the profile inference algorithm for a given flow function
+void llvm::applyFlowInference(FlowFunction &Func) {
+  // Create and apply an inference network model
+  auto InferenceNetwork = MinCostMaxFlow();
+  initializeNetwork(InferenceNetwork, Func);
+  InferenceNetwork.run();
+
+  // Extract flow values for every block and every edge
+  extractWeights(InferenceNetwork, Func);
+
+#ifndef NDEBUG
+  // Verify the result
+  verifyWeights(Func);
+#endif
+}
diff --git a/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp b/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp
index 6d995cf4c048..ea0e8343eb88 100644
--- a/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp
+++ b/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp
@@ -34,6 +34,10 @@ cl::opt<bool> NoWarnSampleUnused(
     cl::desc("Use this option to turn off/on warnings about function with "
              "samples but without debug information to use those samples. "));
 
+cl::opt<bool> SampleProfileUseProfi(
+    "sample-profile-use-profi", cl::init(false), cl::Hidden, cl::ZeroOrMore,
+    cl::desc("Use profi to infer block and edge counts."));
+
 namespace sampleprofutil {
 
 /// Return true if the given callsite is hot wrt to hot cutoff threshold.
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index a042146d7ace..71c15d5c51fc 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -18,6 +18,7 @@
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -1833,22 +1834,6 @@ Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty, bool Root) {
   return V;
 }
 
-/// Check whether value has nuw/nsw/exact set but SCEV does not.
-/// TODO: In reality it is better to check the poison recursively
-/// but this is better than nothing.
-static bool SCEVLostPoisonFlags(const SCEV *S, const Instruction *I) {
-  if (isa<OverflowingBinaryOperator>(I)) {
-    if (auto *NS = dyn_cast<SCEVNAryExpr>(S)) {
-      if (I->hasNoSignedWrap() && !NS->hasNoSignedWrap())
-        return true;
-      if (I->hasNoUnsignedWrap() && !NS->hasNoUnsignedWrap())
-        return true;
-    }
-  } else if (isa<PossiblyExactOperator>(I) && I->isExact())
-    return true;
-  return false;
-}
-
 ScalarEvolution::ValueOffsetPair
 SCEVExpander::FindValueInExprValueMap(const SCEV *S,
                                       const Instruction *InsertPt) {
@@ -1872,8 +1857,7 @@ SCEVExpander::FindValueInExprValueMap(const SCEV *S,
         if (S->getType() == V->getType() &&
             SE.DT.dominates(EntInst, InsertPt) &&
             (SE.LI.getLoopFor(EntInst->getParent()) == nullptr ||
-             SE.LI.getLoopFor(EntInst->getParent())->contains(InsertPt)) &&
-            !SCEVLostPoisonFlags(S, EntInst))
+             SE.LI.getLoopFor(EntInst->getParent())->contains(InsertPt)))
           return {V, Offset};
       }
     }
@@ -1952,26 +1936,36 @@ Value *SCEVExpander::expand(const SCEV *S) {
 
   if (!V)
     V = visit(S);
-  else if (VO.second) {
-    if (PointerType *Vty = dyn_cast<PointerType>(V->getType())) {
-      Type *Ety = Vty->getPointerElementType();
-      int64_t Offset = VO.second->getSExtValue();
-      int64_t ESize = SE.getTypeSizeInBits(Ety);
-      if ((Offset * 8) % ESize == 0) {
-        ConstantInt *Idx =
+  else {
+    // If we're reusing an existing instruction, we are effectively CSEing two
+    // copies of the instruction (with potentially different flags).  As such,
+    // we need to drop any poison generating flags unless we can prove that
+    // said flags must be valid for all new users.
+    if (auto *I = dyn_cast<Instruction>(V))
+      if (I->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(I))
+        I->dropPoisonGeneratingFlags();
+
+    if (VO.second) {
+      if (PointerType *Vty = dyn_cast<PointerType>(V->getType())) {
+        Type *Ety = Vty->getPointerElementType();
+        int64_t Offset = VO.second->getSExtValue();
+        int64_t ESize = SE.getTypeSizeInBits(Ety);
+        if ((Offset * 8) % ESize == 0) {
+          ConstantInt *Idx =
             ConstantInt::getSigned(VO.second->getType(), -(Offset * 8) / ESize);
-        V = Builder.CreateGEP(Ety, V, Idx, "scevgep");
-      } else {
-        ConstantInt *Idx =
+          V = Builder.CreateGEP(Ety, V, Idx, "scevgep");
+        } else {
+          ConstantInt *Idx =
             ConstantInt::getSigned(VO.second->getType(), -Offset);
-        unsigned AS = Vty->getAddressSpace();
-        V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS));
-        V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx,
-                              "uglygep");
-        V = Builder.CreateBitCast(V, Vty);
+          unsigned AS = Vty->getAddressSpace();
+          V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS));
+          V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx,
+                                "uglygep");
+          V = Builder.CreateBitCast(V, Vty);
+        }
+      } else {
+        V = Builder.CreateSub(V, VO.second);
       }
-    } else {
-      V = Builder.CreateSub(V, VO.second);
     }
   }
   // Remember the expanded value for this SCEV at this location.
@@ -2180,7 +2174,9 @@ SCEVExpander::getRelatedExistingExpansion(const SCEV *S, const Instruction *At,
   }
 
   // Use expand's logic which is used for reusing a previous Value in
-  // ExprValueMap.
+  // ExprValueMap.  Note that we don't currently model the cost of
+  // needing to drop poison generating flags on the instruction if we
+  // want to reuse it.  We effectively assume that has zero cost.
   ScalarEvolution::ValueOffsetPair VO = FindValueInExprValueMap(S, At);
   if (VO.first)
     return VO;
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index f467de5f924e..afa3ecde77f9 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -3936,7 +3936,7 @@ bool SimplifyCFGOpt::SimplifyTerminatorOnSelect(Instruction *OldTerm,
   BasicBlock *KeepEdge1 = TrueBB;
   BasicBlock *KeepEdge2 = TrueBB != FalseBB ? FalseBB : nullptr;
 
-  SmallPtrSet<BasicBlock *, 2> RemovedSuccessors;
+  SmallSetVector<BasicBlock *, 2> RemovedSuccessors;
 
   // Then remove the rest.
   for (BasicBlock *Succ : successors(OldTerm)) {
@@ -4782,6 +4782,26 @@ static bool CasesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) {
   return true;
 }
 
+static void createUnreachableSwitchDefault(SwitchInst *Switch,
+                                           DomTreeUpdater *DTU) {
+  LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n");
+  auto *BB = Switch->getParent();
+  auto *OrigDefaultBlock = Switch->getDefaultDest();
+  OrigDefaultBlock->removePredecessor(BB);
+  BasicBlock *NewDefaultBlock = BasicBlock::Create(
+      BB->getContext(), BB->getName() + ".unreachabledefault", BB->getParent(),
+      OrigDefaultBlock);
+  new UnreachableInst(Switch->getContext(), NewDefaultBlock);
+  Switch->setDefaultDest(&*NewDefaultBlock);
+  if (DTU) {
+    SmallVector<DominatorTree::UpdateType, 2> Updates;
+    Updates.push_back({DominatorTree::Insert, BB, &*NewDefaultBlock});
+    if (!is_contained(successors(BB), OrigDefaultBlock))
+      Updates.push_back({DominatorTree::Delete, BB, &*OrigDefaultBlock});
+    DTU->applyUpdates(Updates);
+  }
+}
+
 /// Turn a switch with two reachable destinations into an integer range
 /// comparison and branch.
 bool SimplifyCFGOpt::TurnSwitchRangeIntoICmp(SwitchInst *SI,
@@ -4927,10 +4947,14 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
   // Gather dead cases.
   SmallVector<ConstantInt *, 8> DeadCases;
   SmallDenseMap<BasicBlock *, int, 8> NumPerSuccessorCases;
+  SmallVector<BasicBlock *, 8> UniqueSuccessors;
   for (auto &Case : SI->cases()) {
     auto *Successor = Case.getCaseSuccessor();
-    if (DTU)
+    if (DTU) {
+      if (!NumPerSuccessorCases.count(Successor))
+        UniqueSuccessors.push_back(Successor);
       ++NumPerSuccessorCases[Successor];
+    }
     const APInt &CaseVal = Case.getCaseValue()->getValue();
     if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) ||
         (CaseVal.getMinSignedBits() > MaxSignificantBitsInCond)) {
@@ -4973,9 +4997,9 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
 
   if (DTU) {
     std::vector<DominatorTree::UpdateType> Updates;
-    for (const std::pair<BasicBlock *, int> &I : NumPerSuccessorCases)
-      if (I.second == 0)
-        Updates.push_back({DominatorTree::Delete, SI->getParent(), I.first});
+    for (auto *Successor : UniqueSuccessors)
+      if (NumPerSuccessorCases[Successor] == 0)
+        Updates.push_back({DominatorTree::Delete, SI->getParent(), Successor});
     DTU->applyUpdates(Updates);
   }
 
@@ -6040,15 +6064,13 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
     if (Succ == SI->getDefaultDest())
       continue;
     Succ->removePredecessor(BB);
-    RemovedSuccessors.insert(Succ);
+    if (DTU && RemovedSuccessors.insert(Succ).second)
+      Updates.push_back({DominatorTree::Delete, BB, Succ});
   }
   SI->eraseFromParent();
 
-  if (DTU) {
-    for (BasicBlock *RemovedSuccessor : RemovedSuccessors)
-      Updates.push_back({DominatorTree::Delete, BB, RemovedSuccessor});
+  if (DTU)
     DTU->applyUpdates(Updates);
-  }
 
   ++NumLookupTables;
   if (NeedMask)
@@ -6215,7 +6237,7 @@ bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) {
 
   // Eliminate redundant destinations.
   SmallPtrSet<Value *, 8> Succs;
-  SmallPtrSet<BasicBlock *, 8> RemovedSuccs;
+  SmallSetVector<BasicBlock *, 8> RemovedSuccs;
   for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
     BasicBlock *Dest = IBI->getDestination(i);
     if (!Dest->hasAddressTaken() || !Succs.insert(Dest).second) {
@@ -6305,8 +6327,8 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
 
     // We've found an identical block.  Update our predecessors to take that
     // path instead and make ourselves dead.
-    SmallPtrSet<BasicBlock *, 16> Preds(pred_begin(BB), pred_end(BB));
-    for (BasicBlock *Pred : Preds) {
+    SmallSetVector<BasicBlock *, 16> UniquePreds(pred_begin(BB), pred_end(BB));
+    for (BasicBlock *Pred : UniquePreds) {
       InvokeInst *II = cast<InvokeInst>(Pred->getTerminator());
       assert(II->getNormalDest() != BB && II->getUnwindDest() == BB &&
              "unexpected successor");
@@ -6323,8 +6345,8 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
       if (isa<DbgInfoIntrinsic>(Inst))
         Inst.eraseFromParent();
 
-    SmallPtrSet<BasicBlock *, 16> Succs(succ_begin(BB), succ_end(BB));
-    for (BasicBlock *Succ : Succs) {
+    SmallSetVector<BasicBlock *, 16> UniqueSuccs(succ_begin(BB), succ_end(BB));
+    for (BasicBlock *Succ : UniqueSuccs) {
       Succ->removePredecessor(BB);
       if (DTU)
         Updates.push_back({DominatorTree::Delete, BB, Succ});