| author | Dimitry Andric <dim@FreeBSD.org> | 2017-12-18 20:10:56 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2017-12-18 20:10:56 +0000 |
| commit | 044eb2f6afba375a914ac9d8024f8f5142bb912e | |
| tree | 1475247dc9f9fe5be155ebd4c9069c75aadf8c20 /lib/Transforms/Scalar | |
| parent | eb70dddbd77e120e5d490bd8fbe7ff3f8fa81c6b | |
Diffstat (limited to 'lib/Transforms/Scalar')
54 files changed, 9667 insertions, 3020 deletions
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp index 5b467dc9fe12..1e683db50206 100644 --- a/lib/Transforms/Scalar/ADCE.cpp +++ b/lib/Transforms/Scalar/ADCE.cpp @@ -15,8 +15,10 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/ADCE.h" - +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -27,13 +29,29 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include <cassert> +#include <cstddef> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "adce" @@ -52,10 +70,12 @@ static cl::opt<bool> RemoveLoops("adce-remove-loops", cl::init(false), cl::Hidden); namespace { + /// Information about Instructions struct InstInfoType { /// True if the associated instruction is live. bool Live = false; + /// Quick access to information for block containing associated Instruction. struct BlockInfoType *Block = nullptr; }; @@ -64,10 +84,13 @@ struct InstInfoType { struct BlockInfoType { /// True when this block contains a live instructions. bool Live = false; + /// True when this block ends in an unconditional branch. bool UnconditionalBranch = false; + /// True when this block is known to have live PHI nodes. bool HasLivePhiNodes = false; + /// Control dependence sources need to be live for this block. bool CFLive = false; @@ -75,8 +98,6 @@ struct BlockInfoType { /// holds the value &InstInfo[Terminator] InstInfoType *TerminatorLiveInfo = nullptr; - bool terminatorIsLive() const { return TerminatorLiveInfo->Live; } - /// Corresponding BasicBlock. BasicBlock *BB = nullptr; @@ -85,14 +106,21 @@ struct BlockInfoType { /// Post-order numbering of reverse control flow graph. unsigned PostOrder; + + bool terminatorIsLive() const { return TerminatorLiveInfo->Live; } }; class AggressiveDeadCodeElimination { Function &F; + + // ADCE does not use DominatorTree per se, but it updates it to preserve the + // analysis. + DominatorTree &DT; PostDominatorTree &PDT; /// Mapping of blocks to associated information, an element in BlockInfoVec. - DenseMap<BasicBlock *, BlockInfoType> BlockInfo; + /// Use MapVector to get deterministic iteration order. + MapVector<BasicBlock *, BlockInfoType> BlockInfo; bool isLive(BasicBlock *BB) { return BlockInfo[BB].Live; } /// Mapping of instructions to associated information. @@ -102,6 +130,7 @@ class AggressiveDeadCodeElimination { /// Instructions known to be live where we need to mark /// reaching definitions as live. SmallVector<Instruction *, 128> Worklist; + /// Debug info scopes around a live instruction. 
SmallPtrSet<const Metadata *, 32> AliveScopes; @@ -116,15 +145,19 @@ class AggressiveDeadCodeElimination { /// Set up auxiliary data structures for Instructions and BasicBlocks and /// initialize the Worklist to the set of must-be-live Instructions. void initialize(); + /// Return true for operations which are always treated as live. bool isAlwaysLive(Instruction &I); + /// Return true for instrumentation instructions for value profiling. bool isInstrumentsConstant(Instruction &I); /// Propagate liveness to reaching definitions. void markLiveInstructions(); + /// Mark an instruction as live. void markLive(Instruction *I); + /// Mark a block as live. void markLive(BlockInfoType &BB); void markLive(BasicBlock *BB) { markLive(BlockInfo[BB]); } @@ -157,11 +190,14 @@ class AggressiveDeadCodeElimination { void makeUnconditional(BasicBlock *BB, BasicBlock *Target); public: - AggressiveDeadCodeElimination(Function &F, PostDominatorTree &PDT) - : F(F), PDT(PDT) {} + AggressiveDeadCodeElimination(Function &F, DominatorTree &DT, + PostDominatorTree &PDT) + : F(F), DT(DT), PDT(PDT) {} + bool performDeadCodeElimination(); }; -} + +} // end anonymous namespace bool AggressiveDeadCodeElimination::performDeadCodeElimination() { initialize(); @@ -175,7 +211,6 @@ static bool isUnconditionalBranch(TerminatorInst *Term) { } void AggressiveDeadCodeElimination::initialize() { - auto NumBlocks = F.size(); // We will have an entry in the map for each block so we grow the @@ -217,7 +252,8 @@ void AggressiveDeadCodeElimination::initialize() { // to recording which nodes have been visited we also record whether // a node is currently on the "stack" of active ancestors of the current // node. - typedef DenseMap<BasicBlock *, bool> StatusMap ; + using StatusMap = DenseMap<BasicBlock *, bool>; + class DFState : public StatusMap { public: std::pair<StatusMap::iterator, bool> insert(BasicBlock *BB) { @@ -253,27 +289,23 @@ void AggressiveDeadCodeElimination::initialize() { } } - // Mark blocks live if there is no path from the block to the - // return of the function or a successor for which this is true. - // This protects IDFCalculator which cannot handle such blocks. - for (auto &BBInfoPair : BlockInfo) { - auto &BBInfo = BBInfoPair.second; - if (BBInfo.terminatorIsLive()) - continue; - auto *BB = BBInfo.BB; - if (!PDT.getNode(BB)) { - DEBUG(dbgs() << "Not post-dominated by return: " << BB->getName() + // Mark blocks live if there is no path from the block to a + // return of the function. + // We do this by seeing which of the postdomtree root children exit the + // program, and for all others, mark the subtree live. + for (auto &PDTChild : children<DomTreeNode *>(PDT.getRootNode())) { + auto *BB = PDTChild->getBlock(); + auto &Info = BlockInfo[BB]; + // Real function return + if (isa<ReturnInst>(Info.Terminator)) { + DEBUG(dbgs() << "post-dom root child is a return: " << BB->getName() << '\n';); - markLive(BBInfo.Terminator); continue; } - for (auto *Succ : successors(BB)) if (!PDT.getNode(Succ)) { - DEBUG(dbgs() << "Successor not post-dominated by return: " - << BB->getName() << '\n';); - markLive(BBInfo.Terminator); - break; - } + + // This child is something else, like an infinite loop.
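The loop that follows marks whole post-dominator subtrees live when a root child does not end in a return. For context, a minimal sketch (hypothetical function, not part of the patch) of the kind of code this style of liveness marking lets ADCE delete:

```cpp
// Hypothetical example: the loop result is never used and has no side
// effects. ADCE seeds its worklist only with must-be-live roots (here, the
// return) and propagates liveness to reaching definitions; the loop body,
// and with -adce-remove-loops even the loop's control flow, is removable.
int f(int n) {
  int dead = 0;
  for (int i = 0; i < n; ++i)
    dead += i;    // never read afterwards
  return n + 1;   // the only live computation
}
```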
+ for (auto DFNode : depth_first(PDTChild)) + markLive(BlockInfo[DFNode->getBlock()].Terminator); } // Treat the entry block as always live @@ -318,7 +350,6 @@ bool AggressiveDeadCodeElimination::isInstrumentsConstant(Instruction &I) { } void AggressiveDeadCodeElimination::markLiveInstructions() { - // Propagate liveness backwards to operands. do { // Worklist holds newly discovered live instructions @@ -343,7 +374,6 @@ void AggressiveDeadCodeElimination::markLiveInstructions() { } void AggressiveDeadCodeElimination::markLive(Instruction *I) { - auto &Info = InstInfo[I]; if (Info.Live) return; @@ -430,7 +460,6 @@ void AggressiveDeadCodeElimination::markPhiLive(PHINode *PN) { } void AggressiveDeadCodeElimination::markLiveBranchesFromControlDependences() { - if (BlocksWithDeadTerminators.empty()) return; @@ -469,7 +498,6 @@ void AggressiveDeadCodeElimination::markLiveBranchesFromControlDependences() { // //===----------------------------------------------------------------------===// bool AggressiveDeadCodeElimination::removeDeadInstructions() { - // Updates control and dataflow around dead blocks updateDeadRegions(); @@ -527,7 +555,6 @@ bool AggressiveDeadCodeElimination::removeDeadInstructions() { // A dead region is the set of dead blocks with a common live post-dominator. void AggressiveDeadCodeElimination::updateDeadRegions() { - DEBUG({ dbgs() << "final dead terminator blocks: " << '\n'; for (auto *BB : BlocksWithDeadTerminators) @@ -561,21 +588,40 @@ void AggressiveDeadCodeElimination::updateDeadRegions() { } assert((PreferredSucc && PreferredSucc->PostOrder > 0) && "Failed to find safe successor for dead branch"); + + // Collect removed successors to update the (Post)DominatorTrees. + SmallPtrSet<BasicBlock *, 4> RemovedSuccessors; bool First = true; for (auto *Succ : successors(BB)) { - if (!First || Succ != PreferredSucc->BB) + if (!First || Succ != PreferredSucc->BB) { Succ->removePredecessor(BB); - else + RemovedSuccessors.insert(Succ); + } else First = false; } makeUnconditional(BB, PreferredSucc->BB); + + // Inform the dominators about the deleted CFG edges. + SmallVector<DominatorTree::UpdateType, 4> DeletedEdges; + for (auto *Succ : RemovedSuccessors) { + // It might have happened that the same successor appeared multiple times + // and the CFG edge wasn't really removed. 
+ if (Succ != PreferredSucc->BB) { + DEBUG(dbgs() << "ADCE: (Post)DomTree edge enqueued for deletion" + << BB->getName() << " -> " << Succ->getName() << "\n"); + DeletedEdges.push_back({DominatorTree::Delete, BB, Succ}); + } + } + + DT.applyUpdates(DeletedEdges); + PDT.applyUpdates(DeletedEdges); + NumBranchesRemoved += 1; } } // reverse top-sort order void AggressiveDeadCodeElimination::computeReversePostOrder() { - // This provides a post-order numbering of the reverse control flow graph // Note that it is incomplete in the presence of infinite loops but we don't // need numbers blocks which don't reach the end of the functions since @@ -613,6 +659,9 @@ void AggressiveDeadCodeElimination::makeUnconditional(BasicBlock *BB, InstInfo[NewTerm].Live = true; if (const DILocation *DL = PredTerm->getDebugLoc()) NewTerm->setDebugLoc(DL); + + InstInfo.erase(PredTerm); + PredTerm->eraseFromParent(); } //===----------------------------------------------------------------------===// @@ -621,19 +670,24 @@ void AggressiveDeadCodeElimination::makeUnconditional(BasicBlock *BB, // //===----------------------------------------------------------------------===// PreservedAnalyses ADCEPass::run(Function &F, FunctionAnalysisManager &FAM) { + auto &DT = FAM.getResult<DominatorTreeAnalysis>(F); auto &PDT = FAM.getResult<PostDominatorTreeAnalysis>(F); - if (!AggressiveDeadCodeElimination(F, PDT).performDeadCodeElimination()) + if (!AggressiveDeadCodeElimination(F, DT, PDT).performDeadCodeElimination()) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserveSet<CFGAnalyses>(); PA.preserve<GlobalsAA>(); + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<PostDominatorTreeAnalysis>(); return PA; } namespace { + struct ADCELegacyPass : public FunctionPass { static char ID; // Pass identification, replacement for typeid + ADCELegacyPass() : FunctionPass(ID) { initializeADCELegacyPassPass(*PassRegistry::getPassRegistry()); } @@ -641,22 +695,34 @@ struct ADCELegacyPass : public FunctionPass { bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; + + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); - return AggressiveDeadCodeElimination(F, PDT).performDeadCodeElimination(); + return AggressiveDeadCodeElimination(F, DT, PDT) + .performDeadCodeElimination(); } void getAnalysisUsage(AnalysisUsage &AU) const override { + // We require DominatorTree here only to update and thus preserve it. 
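The DeletedEdges bookkeeping above relies on the incremental applyUpdates API rather than recomputing either tree. A minimal sketch of that pattern in isolation (the helper name is ours, not the patch's; it assumes the IR edge From to To has already been disconnected):

```cpp
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/IR/Dominators.h"
using namespace llvm;

// Hypothetical helper: inform both trees that one CFG edge was deleted,
// instead of rebuilding them from scratch.
static void notifyEdgeDeleted(DominatorTree &DT, PostDominatorTree &PDT,
                              BasicBlock *From, BasicBlock *To) {
  SmallVector<DominatorTree::UpdateType, 1> Updates;
  Updates.push_back({DominatorTree::Delete, From, To});
  DT.applyUpdates(Updates);
  PDT.applyUpdates(Updates);
}
```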
+ AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<PostDominatorTreeWrapperPass>(); if (!RemoveControlFlowFlag) AU.setPreservesCFG(); + else { + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<PostDominatorTreeWrapperPass>(); + } AU.addPreserved<GlobalsAAWrapperPass>(); } }; -} + +} // end anonymous namespace char ADCELegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(ADCELegacyPass, "adce", "Aggressive Dead Code Elimination", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) INITIALIZE_PASS_END(ADCELegacyPass, "adce", "Aggressive Dead Code Elimination", false, false) diff --git a/lib/Transforms/Scalar/BDCE.cpp b/lib/Transforms/Scalar/BDCE.cpp index 2e5618686ec2..851efa000f65 100644 --- a/lib/Transforms/Scalar/BDCE.cpp +++ b/lib/Transforms/Scalar/BDCE.cpp @@ -20,11 +20,8 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/IR/CFG.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Operator.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -48,8 +45,18 @@ static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) { // If all bits of a user are demanded, then we know that nothing below that // in the def-use chain needs to be changed. auto *J = dyn_cast<Instruction>(JU); - if (J && !DB.getDemandedBits(J).isAllOnesValue()) + if (J && J->getType()->isSized() && + !DB.getDemandedBits(J).isAllOnesValue()) WorkList.push_back(J); + + // Note that we need to check for unsized types above before asking for + // demanded bits. Normally, the only way to reach an instruction with an + // unsized type is via an instruction that has side effects (or otherwise + // will demand its input bits). However, if we have a readnone function + // that returns an unsized type (e.g., void), we must avoid asking for the + // demanded bits of the function call's return value. A void-returning + // readnone function is always dead (and so we can stop walking the use/def + // chain here), but the check is necessary to avoid asserting. } // DFS through subsequent users while tracking visits to avoid cycles. @@ -70,7 +77,8 @@ static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) { // If all bits of a user are demanded, then we know that nothing below // that in the def-use chain needs to be changed. 
auto *K = dyn_cast<Instruction>(KU); - if (K && !Visited.count(K) && !DB.getDemandedBits(K).isAllOnesValue()) + if (K && !Visited.count(K) && K->getType()->isSized() && + !DB.getDemandedBits(K).isAllOnesValue()) WorkList.push_back(K); } } diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index 457c9427ab9a..0562d3882f8b 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -2,11 +2,13 @@ add_llvm_library(LLVMScalarOpts ADCE.cpp AlignmentFromAssumptions.cpp BDCE.cpp + CallSiteSplitting.cpp ConstantHoisting.cpp ConstantProp.cpp CorrelatedValuePropagation.cpp DCE.cpp DeadStoreElimination.cpp + DivRemPairs.cpp EarlyCSE.cpp FlattenCFGPass.cpp Float2Int.cpp @@ -42,6 +44,7 @@ add_llvm_library(LLVMScalarOpts LowerExpectIntrinsic.cpp LowerGuardIntrinsic.cpp MemCpyOptimizer.cpp + MergeICmps.cpp MergedLoadStoreMotion.cpp NaryReassociate.cpp NewGVN.cpp @@ -59,6 +62,7 @@ add_llvm_library(LLVMScalarOpts SimplifyCFGPass.cpp Sink.cpp SpeculativeExecution.cpp + SpeculateAroundPHIs.cpp StraightLineStrengthReduce.cpp StructurizeCFG.cpp TailRecursionElimination.cpp diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp new file mode 100644 index 000000000000..d8c408035038 --- /dev/null +++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -0,0 +1,428 @@ +//===- CallSiteSplitting.cpp ----------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a transformation that tries to split a call-site to pass +// more constrained arguments if its argument is predicated in the control flow +// so that we can expose better context to the later passes (e.g, inliner, jump +// threading, or IPA-CP based function cloning, etc.). +// As of now we support two cases : +// +// 1) If a call site is dominated by an OR condition and if any of its arguments +// are predicated on this OR condition, try to split the condition with more +// constrained arguments. For example, in the code below, we try to split the +// call site since we can predicate the argument(ptr) based on the OR condition. 
+// +// Split from : +// if (!ptr || c) +// callee(ptr); +// to : +// if (!ptr) +// callee(null) // set the known constant value +// else if (c) +// callee(nonnull ptr) // set non-null attribute in the argument +// +// 2) We can also split a call-site based on constant incoming values of a PHI +// For example, +// from : +// Header: +// %c = icmp eq i32 %i1, %i2 +// br i1 %c, label %Tail, label %TBB +// TBB: +// br label Tail% +// Tail: +// %p = phi i32 [ 0, %Header], [ 1, %TBB] +// call void @bar(i32 %p) +// to +// Header: +// %c = icmp eq i32 %i1, %i2 +// br i1 %c, label %Tail-split0, label %TBB +// TBB: +// br label %Tail-split1 +// Tail-split0: +// call void @bar(i32 0) +// br label %Tail +// Tail-split1: +// call void @bar(i32 1) +// br label %Tail +// Tail: +// %p = phi i32 [ 0, %Tail-split0 ], [ 1, %Tail-split1 ] +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/CallSiteSplitting.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "callsite-splitting" + +STATISTIC(NumCallSiteSplit, "Number of call-site split"); + +static void addNonNullAttribute(Instruction *CallI, Instruction *NewCallI, + Value *Op) { + CallSite CS(NewCallI); + unsigned ArgNo = 0; + for (auto &I : CS.args()) { + if (&*I == Op) + CS.addParamAttr(ArgNo, Attribute::NonNull); + ++ArgNo; + } +} + +static void setConstantInArgument(Instruction *CallI, Instruction *NewCallI, + Value *Op, Constant *ConstValue) { + CallSite CS(NewCallI); + unsigned ArgNo = 0; + for (auto &I : CS.args()) { + if (&*I == Op) + CS.setArgument(ArgNo, ConstValue); + ++ArgNo; + } +} + +static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallSite CS) { + assert(isa<Constant>(Cmp->getOperand(1)) && "Expected a constant operand."); + Value *Op0 = Cmp->getOperand(0); + unsigned ArgNo = 0; + for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E; + ++I, ++ArgNo) { + // Don't consider constant or arguments that are already known non-null. + if (isa<Constant>(*I) || CS.paramHasAttr(ArgNo, Attribute::NonNull)) + continue; + + if (*I == Op0) + return true; + } + return false; +} + +/// If From has a conditional jump to To, add the condition to Conditions, +/// if it is relevant to any argument at CS. +static void +recordCondition(const CallSite &CS, BasicBlock *From, BasicBlock *To, + SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) { + auto *BI = dyn_cast<BranchInst>(From->getTerminator()); + if (!BI || !BI->isConditional()) + return; + + CmpInst::Predicate Pred; + Value *Cond = BI->getCondition(); + if (!match(Cond, m_ICmp(Pred, m_Value(), m_Constant()))) + return; + + ICmpInst *Cmp = cast<ICmpInst>(Cond); + if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) + if (isCondRelevantToAnyCallArgument(Cmp, CS)) + Conditions.push_back({Cmp, From->getTerminator()->getSuccessor(0) == To + ? Pred + : Cmp->getInversePredicate()}); +} + +/// Record ICmp conditions relevant to any argument in CS following Pred's +/// single successors. If there are conflicting conditions along a path, like +/// x == 1 and x == 0, the first condition will be used. 
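At the source level, the OR-predicated case described in the file header corresponds roughly to this hand-written before/after (hypothetical functions, rendered in C++ rather than IR):

```cpp
int use(int *p, bool c);

// Before: one call site; the argument is only weakly known.
int caller(int *p, bool c) {
  if (!p || c)
    return use(p, c);
  return 0;
}

// After splitting (conceptually): each path gets its own call site, so later
// passes see a constant argument on one path and a non-null pointer on the
// other.
int callerSplit(int *p, bool c) {
  if (!p)
    return use(nullptr, c); // argument known to be null here
  if (c)
    return use(p, c);       // p known non-null here
  return 0;
}
```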
+static void +recordConditions(const CallSite &CS, BasicBlock *Pred, + SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) { + recordCondition(CS, Pred, CS.getInstruction()->getParent(), Conditions); + BasicBlock *From = Pred; + BasicBlock *To = Pred; + SmallPtrSet<BasicBlock *, 4> Visited = {From}; + while (!Visited.count(From->getSinglePredecessor()) && + (From = From->getSinglePredecessor())) { + recordCondition(CS, From, To, Conditions); + To = From; + } +} + +static Instruction * +addConditions(CallSite &CS, + SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) { + if (Conditions.empty()) + return nullptr; + + Instruction *NewCI = CS.getInstruction()->clone(); + for (auto &Cond : Conditions) { + Value *Arg = Cond.first->getOperand(0); + Constant *ConstVal = cast<Constant>(Cond.first->getOperand(1)); + if (Cond.second == ICmpInst::ICMP_EQ) + setConstantInArgument(CS.getInstruction(), NewCI, Arg, ConstVal); + else if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) { + assert(Cond.second == ICmpInst::ICMP_NE); + addNonNullAttribute(CS.getInstruction(), NewCI, Arg); + } + } + return NewCI; +} + +static SmallVector<BasicBlock *, 2> getTwoPredecessors(BasicBlock *BB) { + SmallVector<BasicBlock *, 2> Preds(predecessors((BB))); + assert(Preds.size() == 2 && "Expected exactly 2 predecessors!"); + return Preds; +} + +static bool canSplitCallSite(CallSite CS) { + // FIXME: As of now we handle only CallInst. InvokeInst could be handled + // without too much effort. + Instruction *Instr = CS.getInstruction(); + if (!isa<CallInst>(Instr)) + return false; + + // Allow splitting a call-site only when there is no instruction before the + // call-site in the basic block. Based on this constraint, we only clone the + // call instruction, and we do not move a call-site across any other + // instruction. + BasicBlock *CallSiteBB = Instr->getParent(); + if (Instr != CallSiteBB->getFirstNonPHIOrDbg()) + return false; + + // Need 2 predecessors and cannot split an edge from an IndirectBrInst. + SmallVector<BasicBlock *, 2> Preds(predecessors(CallSiteBB)); + if (Preds.size() != 2 || isa<IndirectBrInst>(Preds[0]->getTerminator()) || + isa<IndirectBrInst>(Preds[1]->getTerminator())) + return false; + + return CallSiteBB->canSplitPredecessors(); +} + +/// Return true if the CS is split into its new predecessors which are directly +/// hooked to each of its original predecessors pointed by PredBB1 and PredBB2. +/// In the OR predicated case, PredBB1 will point to the header, and PredBB2 will point +/// to the second compare block. CallInst1 and CallInst2 will be the new +/// call-sites placed in the new predecessors split for PredBB1 and PredBB2, +/// respectively. Therefore, CallInst1 will be the call-site placed +/// between Header and Tail, and CallInst2 will be the call-site between TBB and +/// Tail.
For example, in the IR below with an OR condition, the call-site can +/// be split +/// +/// from : +/// +/// Header: +/// %c = icmp eq i32* %a, null +/// br i1 %c %Tail, %TBB +/// TBB: +/// %c2 = icmp eq i32* %b, null +/// br i1 %c %Tail, %End +/// Tail: +/// %ca = call i1 @callee (i32* %a, i32* %b) +/// +/// to : +/// +/// Header: // PredBB1 is Header +/// %c = icmp eq i32* %a, null +/// br i1 %c %Tail-split1, %TBB +/// TBB: // PredBB2 is TBB +/// %c2 = icmp eq i32* %b, null +/// br i1 %c %Tail-split2, %End +/// Tail-split1: +/// %ca1 = call @callee (i32* null, i32* %b) // CallInst1 +/// br %Tail +/// Tail-split2: +/// %ca2 = call @callee (i32* nonnull %a, i32* null) // CallInst2 +/// br %Tail +/// Tail: +/// %p = phi i1 [%ca1, %Tail-split1],[%ca2, %Tail-split2] +/// +/// Note that for an OR predicated case, CallInst1 and CallInst2 should be +/// created with more constrained arguments in +/// createCallSitesOnOrPredicatedArgument(). +static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2, + Instruction *CallInst1, Instruction *CallInst2) { + Instruction *Instr = CS.getInstruction(); + BasicBlock *TailBB = Instr->getParent(); + assert(Instr == (TailBB->getFirstNonPHIOrDbg()) && "Unexpected call-site"); + + BasicBlock *SplitBlock1 = + SplitBlockPredecessors(TailBB, PredBB1, ".predBB1.split"); + BasicBlock *SplitBlock2 = + SplitBlockPredecessors(TailBB, PredBB2, ".predBB2.split"); + + assert((SplitBlock1 && SplitBlock2) && "Unexpected new basic block split."); + + if (!CallInst1) + CallInst1 = Instr->clone(); + if (!CallInst2) + CallInst2 = Instr->clone(); + + CallInst1->insertBefore(&*SplitBlock1->getFirstInsertionPt()); + CallInst2->insertBefore(&*SplitBlock2->getFirstInsertionPt()); + + CallSite CS1(CallInst1); + CallSite CS2(CallInst2); + + // Handle PHIs used as arguments in the call-site. + for (auto &PI : *TailBB) { + PHINode *PN = dyn_cast<PHINode>(&PI); + if (!PN) + break; + unsigned ArgNo = 0; + for (auto &CI : CS.args()) { + if (&*CI == PN) { + CS1.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock1)); + CS2.setArgument(ArgNo, PN->getIncomingValueForBlock(SplitBlock2)); + } + ++ArgNo; + } + } + + // Replace users of the original call with a PHI merging call-sites split. + if (Instr->getNumUses()) { + PHINode *PN = PHINode::Create(Instr->getType(), 2, "phi.call", + TailBB->getFirstNonPHI()); + PN->addIncoming(CallInst1, SplitBlock1); + PN->addIncoming(CallInst2, SplitBlock2); + Instr->replaceAllUsesWith(PN); + } + DEBUG(dbgs() << "split call-site : " << *Instr << " into \n"); + DEBUG(dbgs() << " " << *CallInst1 << " in " << SplitBlock1->getName() + << "\n"); + DEBUG(dbgs() << " " << *CallInst2 << " in " << SplitBlock2->getName() + << "\n"); + Instr->eraseFromParent(); + NumCallSiteSplit++; +} + +// Return true if the call-site has an argument which is a PHI with only +// constant incoming values.
+static bool isPredicatedOnPHI(CallSite CS) { + Instruction *Instr = CS.getInstruction(); + BasicBlock *Parent = Instr->getParent(); + if (Instr != Parent->getFirstNonPHIOrDbg()) + return false; + + for (auto &BI : *Parent) { + if (PHINode *PN = dyn_cast<PHINode>(&BI)) { + for (auto &I : CS.args()) + if (&*I == PN) { + assert(PN->getNumIncomingValues() == 2 && + "Unexpected number of incoming values"); + if (PN->getIncomingBlock(0) == PN->getIncomingBlock(1)) + return false; + if (PN->getIncomingValue(0) == PN->getIncomingValue(1)) + continue; + if (isa<Constant>(PN->getIncomingValue(0)) && + isa<Constant>(PN->getIncomingValue(1))) + return true; + } + } + break; + } + return false; +} + +static bool tryToSplitOnPHIPredicatedArgument(CallSite CS) { + if (!isPredicatedOnPHI(CS)) + return false; + + auto Preds = getTwoPredecessors(CS.getInstruction()->getParent()); + splitCallSite(CS, Preds[0], Preds[1], nullptr, nullptr); + return true; +} +// Check if one of the predecessors is a single predecessor of the other. +// This is a requirement for control flow modeling an OR. HeaderBB points to +// the single predecessor and OrBB points to the other node. HeaderBB potentially +// contains the first compare of the OR and OrBB the second. +static bool isOrHeader(BasicBlock *HeaderBB, BasicBlock *OrBB) { + return OrBB->getSinglePredecessor() == HeaderBB && + HeaderBB->getTerminator()->getNumSuccessors() == 2; +} + +static bool tryToSplitOnOrPredicatedArgument(CallSite CS) { + auto Preds = getTwoPredecessors(CS.getInstruction()->getParent()); + if (!isOrHeader(Preds[0], Preds[1]) && !isOrHeader(Preds[1], Preds[0])) + return false; + + SmallVector<std::pair<ICmpInst *, unsigned>, 2> C1, C2; + recordConditions(CS, Preds[0], C1); + recordConditions(CS, Preds[1], C2); + + Instruction *CallInst1 = addConditions(CS, C1); + Instruction *CallInst2 = addConditions(CS, C2); + if (!CallInst1 && !CallInst2) + return false; + + splitCallSite(CS, Preds[1], Preds[0], CallInst2, CallInst1); + return true; +} + +static bool tryToSplitCallSite(CallSite CS) { + if (!CS.arg_size() || !canSplitCallSite(CS)) + return false; + return tryToSplitOnOrPredicatedArgument(CS) || + tryToSplitOnPHIPredicatedArgument(CS); +} + +static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) { + bool Changed = false; + for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) { + BasicBlock &BB = *BI++; + for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) { + Instruction *I = &*II++; + CallSite CS(cast<Value>(I)); + if (!CS || isa<IntrinsicInst>(I) || isInstructionTriviallyDead(I, &TLI)) + continue; + + Function *Callee = CS.getCalledFunction(); + if (!Callee || Callee->isDeclaration()) + continue; + Changed |= tryToSplitCallSite(CS); + } + } + return Changed; +} + +namespace { +struct CallSiteSplittingLegacyPass : public FunctionPass { + static char ID; + CallSiteSplittingLegacyPass() : FunctionPass(ID) { + initializeCallSiteSplittingLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetLibraryInfoWrapperPass>(); + FunctionPass::getAnalysisUsage(AU); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + return doCallSiteSplitting(F, TLI); + } +}; +} // namespace + +char CallSiteSplittingLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting", + "Call-site splitting", false,
false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting", + "Call-site splitting", false, false) +FunctionPass *llvm::createCallSiteSplittingPass() { + return new CallSiteSplittingLegacyPass(); +} + +PreservedAnalyses CallSiteSplittingPass::run(Function &F, + FunctionAnalysisManager &AM) { + auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); + + if (!doCallSiteSplitting(F, TLI)) + return PreservedAnalyses::all(); + PreservedAnalyses PA; + return PA; +} diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp index 122c9314e022..e4b08c5ed305 100644 --- a/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -34,18 +34,39 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/ConstantHoisting.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" +#include "llvm/Support/BlockFrequency.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> #include <tuple> +#include <utility> using namespace llvm; using namespace consthoist; @@ -62,10 +83,12 @@ static cl::opt<bool> ConstHoistWithBlockFrequency( "without hoisting.")); namespace { + /// \brief The constant hoisting pass. class ConstantHoistingLegacyPass : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid + ConstantHoistingLegacyPass() : FunctionPass(ID) { initializeConstantHoistingLegacyPassPass(*PassRegistry::getPassRegistry()); } @@ -87,9 +110,11 @@ public: private: ConstantHoistingPass Impl; }; -} + +} // end anonymous namespace char ConstantHoistingLegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(ConstantHoistingLegacyPass, "consthoist", "Constant Hoisting", false, false) INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) @@ -128,7 +153,6 @@ bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) { return MadeChange; } - /// \brief Find the constant materialization insertion point. Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst, unsigned Idx) const { @@ -217,8 +241,9 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, } // Visit Orders in bottom-up order. - typedef std::pair<SmallPtrSet<BasicBlock *, 16>, BlockFrequency> - InsertPtsCostPair; + using InsertPtsCostPair = + std::pair<SmallPtrSet<BasicBlock *, 16>, BlockFrequency>; + // InsertPtsMap is a map from a BB to the best insertion points for the // subtree of BB (subtree not including the BB itself). 
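The insertion-point search above exists so that one expensive immediate can be materialized once and nearby constants rebased from it. Roughly, at the source level (hypothetical values; the real decision is driven by TTI->getIntImmCodeSizeCost):

```cpp
#include <cstdint>

// Two large immediates that differ by a small offset become one hoisted base
// plus a cheap add.
uint64_t g(bool c, uint64_t x) {
  const uint64_t Base = 0x123456789ABC0000ULL; // materialized once
  if (c)
    return x + Base;          // was: x + 0x123456789ABC0000
  return x + (Base + 0x10);   // was: x + 0x123456789ABC0010
}
```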
DenseMap<BasicBlock *, InsertPtsCostPair> InsertPtsMap; @@ -310,7 +335,6 @@ SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint( return InsertPts; } - /// \brief Record constant integer ConstInt for instruction Inst at operand /// index Idx. /// @@ -351,7 +375,6 @@ void ConstantHoistingPass::collectConstantCandidates( } } - /// \brief Check the operand for instruction Inst at index Idx. void ConstantHoistingPass::collectConstantCandidates( ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx) { @@ -393,7 +416,6 @@ void ConstantHoistingPass::collectConstantCandidates( } } - /// \brief Scan the instruction for expensive integer constants and record them /// in the constant candidate vector. void ConstantHoistingPass::collectConstantCandidates( @@ -427,9 +449,8 @@ void ConstantHoistingPass::collectConstantCandidates(Function &Fn) { // bit widths (APInt Operator- does not like that). If the value cannot be // represented in uint64 we return an "empty" APInt. This is then interpreted // as the value is not in range. -static llvm::Optional<APInt> calculateOffsetDiff(const APInt &V1, - const APInt &V2) { - llvm::Optional<APInt> Res = None; +static Optional<APInt> calculateOffsetDiff(const APInt &V1, const APInt &V2) { + Optional<APInt> Res = None; unsigned BW = V1.getBitWidth() > V2.getBitWidth() ? V1.getBitWidth() : V2.getBitWidth(); uint64_t LimVal1 = V1.getLimitedValue(); @@ -496,9 +517,9 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S, DEBUG(dbgs() << "Cost: " << Cost << "\n"); for (auto C2 = S; C2 != E; ++C2) { - llvm::Optional<APInt> Diff = calculateOffsetDiff( - C2->ConstInt->getValue(), - ConstCand->ConstInt->getValue()); + Optional<APInt> Diff = calculateOffsetDiff( + C2->ConstInt->getValue(), + ConstCand->ConstInt->getValue()); if (Diff) { const int ImmCosts = TTI->getIntImmCodeSizeCost(Opcode, OpndIdx, Diff.getValue(), Ty); @@ -696,6 +717,9 @@ bool ConstantHoistingPass::emitBaseConstants() { IntegerType *Ty = ConstInfo.BaseConstant->getType(); Instruction *Base = new BitCastInst(ConstInfo.BaseConstant, Ty, "const", IP); + + Base->setDebugLoc(IP->getDebugLoc()); + DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant << ") to BB " << IP->getParent()->getName() << '\n' << *Base << '\n'); @@ -714,6 +738,8 @@ bool ConstantHoistingPass::emitBaseConstants() { emitBaseConstants(Base, RCI.Offset, U); ReBasesNum++; } + + Base->setDebugLoc(DILocation::getMergedLocation(Base->getDebugLoc(), U.Inst->getDebugLoc())); } } UsesNum = Uses; @@ -722,7 +748,6 @@ bool ConstantHoistingPass::emitBaseConstants() { assert(!Base->use_empty() && "The use list is empty!?"); assert(isa<Instruction>(Base->user_back()) && "All uses should be instructions."); - Base->setDebugLoc(cast<Instruction>(Base->user_back())->getDebugLoc()); } (void)UsesNum; (void)ReBasesNum; diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 28157783daa7..8f468ebf8949 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -12,22 +12,41 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" #include 
"llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" +#include <cassert> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "correlated-value-propagation" @@ -41,13 +60,16 @@ STATISTIC(NumDeadCases, "Number of switch cases removed"); STATISTIC(NumSDivs, "Number of sdiv converted to udiv"); STATISTIC(NumAShrs, "Number of ashr converted to lshr"); STATISTIC(NumSRems, "Number of srem converted to urem"); +STATISTIC(NumOverflows, "Number of overflow checks removed"); static cl::opt<bool> DontProcessAdds("cvp-dont-process-adds", cl::init(true)); namespace { + class CorrelatedValuePropagation : public FunctionPass { public: static char ID; + CorrelatedValuePropagation(): FunctionPass(ID) { initializeCorrelatedValuePropagationPass(*PassRegistry::getPassRegistry()); } @@ -59,9 +81,11 @@ namespace { AU.addPreserved<GlobalsAAWrapperPass>(); } }; -} + +} // end anonymous namespace char CorrelatedValuePropagation::ID = 0; + INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation", "Value Propagation", false, false) INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass) @@ -302,11 +326,72 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) { return Changed; } +// See if we can prove that the given overflow intrinsic will not overflow. +static bool willNotOverflow(IntrinsicInst *II, LazyValueInfo *LVI) { + using OBO = OverflowingBinaryOperator; + auto NoWrap = [&] (Instruction::BinaryOps BinOp, unsigned NoWrapKind) { + Value *RHS = II->getOperand(1); + ConstantRange RRange = LVI->getConstantRange(RHS, II->getParent(), II); + ConstantRange NWRegion = ConstantRange::makeGuaranteedNoWrapRegion( + BinOp, RRange, NoWrapKind); + // As an optimization, do not compute LRange if we do not need it. 
+ if (NWRegion.isEmptySet()) + return false; + Value *LHS = II->getOperand(0); + ConstantRange LRange = LVI->getConstantRange(LHS, II->getParent(), II); + return NWRegion.contains(LRange); + }; + switch (II->getIntrinsicID()) { + default: + break; + case Intrinsic::uadd_with_overflow: + return NoWrap(Instruction::Add, OBO::NoUnsignedWrap); + case Intrinsic::sadd_with_overflow: + return NoWrap(Instruction::Add, OBO::NoSignedWrap); + case Intrinsic::usub_with_overflow: + return NoWrap(Instruction::Sub, OBO::NoUnsignedWrap); + case Intrinsic::ssub_with_overflow: + return NoWrap(Instruction::Sub, OBO::NoSignedWrap); + } + return false; +} + +static void processOverflowIntrinsic(IntrinsicInst *II) { + Value *NewOp = nullptr; + switch (II->getIntrinsicID()) { + default: + llvm_unreachable("Unexpected instruction."); + case Intrinsic::uadd_with_overflow: + case Intrinsic::sadd_with_overflow: + NewOp = BinaryOperator::CreateAdd(II->getOperand(0), II->getOperand(1), + II->getName(), II); + break; + case Intrinsic::usub_with_overflow: + case Intrinsic::ssub_with_overflow: + NewOp = BinaryOperator::CreateSub(II->getOperand(0), II->getOperand(1), + II->getName(), II); + break; + } + ++NumOverflows; + IRBuilder<> B(II); + Value *NewI = B.CreateInsertValue(UndefValue::get(II->getType()), NewOp, 0); + NewI = B.CreateInsertValue(NewI, ConstantInt::getFalse(II->getContext()), 1); + II->replaceAllUsesWith(NewI); + II->eraseFromParent(); +} + /// Infer nonnull attributes for the arguments at the specified callsite. static bool processCallSite(CallSite CS, LazyValueInfo *LVI) { SmallVector<unsigned, 4> ArgNos; unsigned ArgNo = 0; + if (auto *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) { + if (willNotOverflow(II, LVI)) { + processOverflowIntrinsic(II); + return true; + } + } + for (Value *V : CS.args()) { PointerType *Type = dyn_cast<PointerType>(V->getType()); // Try to mark pointer typed parameters as non-null. We skip the @@ -335,18 +420,6 @@ static bool processCallSite(CallSite CS, LazyValueInfo *LVI) { return true; } -// Helper function to rewrite srem and sdiv. As a policy choice, we choose not -// to waste compile time on anything where the operands are local defs. While -// LVI can sometimes reason about such cases, it's not its primary purpose. -static bool hasLocalDefs(BinaryOperator *SDI) { - for (Value *O : SDI->operands()) { - auto *I = dyn_cast<Instruction>(O); - if (I && I->getParent() == SDI->getParent()) - return true; - } - return false; -} - static bool hasPositiveOperands(BinaryOperator *SDI, LazyValueInfo *LVI) { Constant *Zero = ConstantInt::get(SDI->getType(), 0); for (Value *O : SDI->operands()) { @@ -358,7 +431,7 @@ static bool hasPositiveOperands(BinaryOperator *SDI, LazyValueInfo *LVI) { } static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) { - if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI) || + if (SDI->getType()->isVectorTy() || !hasPositiveOperands(SDI, LVI)) return false; @@ -376,7 +449,7 @@ static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) { /// conditions, this can sometimes prove conditions instcombine can't by /// exploiting range information. 
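The effect of willNotOverflow can be sanity-checked concretely. A runnable sketch, with a hypothetical operand range, of why a proven range makes the overflow bit constant false:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // If analysis proves x is in [0, 100], an 8-bit unsigned add of 1 cannot
  // wrap, so a check like llvm.uadd.with.overflow.i8 folds to {x + 1, false}.
  for (uint8_t X = 0; X <= 100; ++X) {
    uint8_t Sum;
    if (__builtin_add_overflow(X, (uint8_t)1, &Sum))
      return 1; // unreachable for this range
  }
  puts("overflow bit is always false for x in [0, 100]");
  return 0;
}
```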
static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) { - if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI) || + if (SDI->getType()->isVectorTy() || !hasPositiveOperands(SDI, LVI)) return false; @@ -391,7 +464,7 @@ static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) { } static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) { - if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI)) + if (SDI->getType()->isVectorTy()) return false; Constant *Zero = ConstantInt::get(SDI->getType(), 0); @@ -410,12 +483,12 @@ static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) { } static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) { - typedef OverflowingBinaryOperator OBO; + using OBO = OverflowingBinaryOperator; if (DontProcessAdds) return false; - if (AddOp->getType()->isVectorTy() || hasLocalDefs(AddOp)) + if (AddOp->getType()->isVectorTy()) return false; bool NSW = AddOp->hasNoSignedWrap(); @@ -492,7 +565,7 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) { // blocks before querying later blocks (which require us to analyze early // blocks). Eagerly simplifying shallow blocks means there is strictly less // work to do for deep blocks. This also means we don't visit unreachable - // blocks. + // blocks. for (BasicBlock *BB : depth_first(&F.getEntryBlock())) { bool BBChanged = false; for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index 1ec38e56aa4c..e703014bb0e6 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -16,31 +16,55 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/DeadStoreElimination.h" +#include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstddef> +#include <iterator> #include <map> +#include <utility> + using namespace llvm; #define 
DEBUG_TYPE "dse" @@ -49,18 +73,23 @@ STATISTIC(NumRedundantStores, "Number of redundant stores deleted"); STATISTIC(NumFastStores, "Number of stores deleted"); STATISTIC(NumFastOther , "Number of other instrs removed"); STATISTIC(NumCompletePartials, "Number of stores dead by later partials"); +STATISTIC(NumModifiedStores, "Number of stores modified"); static cl::opt<bool> EnablePartialOverwriteTracking("enable-dse-partial-overwrite-tracking", cl::init(true), cl::Hidden, cl::desc("Enable partial-overwrite tracking in DSE")); +static cl::opt<bool> +EnablePartialStoreMerging("enable-dse-partial-store-merging", + cl::init(true), cl::Hidden, + cl::desc("Enable partial store merging in DSE")); //===----------------------------------------------------------------------===// // Helper functions //===----------------------------------------------------------------------===// -typedef std::map<int64_t, int64_t> OverlapIntervalsTy; -typedef DenseMap<Instruction *, OverlapIntervalsTy> InstOverlapIntervalsTy; +using OverlapIntervalsTy = std::map<int64_t, int64_t>; +using InstOverlapIntervalsTy = DenseMap<Instruction *, OverlapIntervalsTy>; /// Delete this instruction. Before we do, go through and zero out all the /// operands of this instruction. If any of them become dead, delete them and @@ -209,7 +238,6 @@ static bool isRemovable(Instruction *I) { case Intrinsic::init_trampoline: // Always safe to remove init_trampoline. return true; - case Intrinsic::memset: case Intrinsic::memmove: case Intrinsic::memcpy: @@ -224,7 +252,6 @@ static bool isRemovable(Instruction *I) { return false; } - /// Returns true if the end of this instruction can be safely shortened in /// length. static bool isShortenableAtTheEnd(Instruction *I) { @@ -287,14 +314,24 @@ static uint64_t getPointerSize(const Value *V, const DataLayout &DL, } namespace { -enum OverwriteResult { OW_Begin, OW_Complete, OW_End, OW_Unknown }; -} + +enum OverwriteResult { + OW_Begin, + OW_Complete, + OW_End, + OW_PartialEarlierWithFullLater, + OW_Unknown +}; + +} // end anonymous namespace /// Return 'OW_Complete' if a store to the 'Later' location completely /// overwrites a store to the 'Earlier' location, 'OW_End' if the end of the /// 'Earlier' location is completely overwritten by 'Later', 'OW_Begin' if the -/// beginning of the 'Earlier' location is overwritten by 'Later', or -/// 'OW_Unknown' if nothing can be determined. +/// beginning of the 'Earlier' location is overwritten by 'Later'. +/// 'OW_PartialEarlierWithFullLater' means that an earlier (big) store was +/// overwritten by a latter (smaller) store which doesn't write outside the big +/// store's memory locations. Returns 'OW_Unknown' if nothing can be determined. static OverwriteResult isOverwrite(const MemoryLocation &Later, const MemoryLocation &Earlier, const DataLayout &DL, @@ -427,6 +464,19 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later, } } + // Check for an earlier store which writes to all the memory locations that + // the later store writes to. + if (EnablePartialStoreMerging && LaterOff >= EarlierOff && + int64_t(EarlierOff + Earlier.Size) > LaterOff && + uint64_t(LaterOff - EarlierOff) + Later.Size <= Earlier.Size) { + DEBUG(dbgs() << "DSE: Partial overwrite an earlier load [" << EarlierOff + << ", " << int64_t(EarlierOff + Earlier.Size) + << ") by a later store [" << LaterOff << ", " + << int64_t(LaterOff + Later.Size) << ")\n"); + // TODO: Maybe come up with a better name? 
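The merge that OW_PartialEarlierWithFullLater enables (implemented further down in eliminateDeadStores) reduces to mask-and-shift arithmetic. A runnable little-endian sketch with made-up store values:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Earlier = 0x11223344;   // earlier 4-byte store
  uint8_t Later = 0xAB;            // later 1-byte store at byte offset 1
  unsigned LShiftAmount = 1 * 8;   // little-endian: byte offset in bits
  uint32_t Mask = 0xFFu << LShiftAmount;
  uint32_t Merged = (Earlier & ~Mask) | (uint32_t(Later) << LShiftAmount);
  printf("0x%08X\n", Merged);      // prints 0x1122AB44
}
```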
+ return OW_PartialEarlierWithFullLater; + } + // Another interesting case is if the later store overwrites the end of the // earlier store. // @@ -544,11 +594,9 @@ static bool memoryIsNotModifiedBetween(Instruction *FirstI, } for (; BI != EI; ++BI) { Instruction *I = &*BI; - if (I->mayWriteToMemory() && I != SecondI) { - auto Res = AA->getModRefInfo(I, MemLoc); - if (Res & MRI_Mod) + if (I->mayWriteToMemory() && I != SecondI) + if (isModSet(AA->getModRefInfo(I, MemLoc))) return false; - } } if (B != FirstBB) { assert(B != &FirstBB->getParent()->getEntryBlock() && @@ -772,9 +820,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, // the call is live. DeadStackObjects.remove_if([&](Value *I) { // See if the call site touches the value. - ModRefInfo A = AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI)); - - return A == MRI_ModRef || A == MRI_Ref; + return isRefSet(AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI))); }); // If all of the allocas were clobbered by the call then we're not going @@ -840,7 +886,7 @@ static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset, if (!IsOverwriteEnd) LaterOffset = int64_t(LaterOffset + LaterSize); - if (!(llvm::isPowerOf2_64(LaterOffset) && EarlierWriteAlign <= LaterOffset) && + if (!(isPowerOf2_64(LaterOffset) && EarlierWriteAlign <= LaterOffset) && !((EarlierWriteAlign != 0) && LaterOffset % EarlierWriteAlign == 0)) return false; @@ -1094,6 +1140,8 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, // If we find a write that is a) removable (i.e., non-volatile), b) is // completely obliterated by the store to 'Loc', and c) which we know that // 'Inst' doesn't load from, then we can remove it. + // Also try to merge two stores if a later one only touches memory written + // to by the earlier one. if (isRemovable(DepWrite) && !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) { int64_t InstWriteOffset, DepWriteOffset; @@ -1123,6 +1171,72 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, bool IsOverwriteEnd = (OR == OW_End); MadeChange |= tryToShorten(DepWrite, DepWriteOffset, EarlierSize, InstWriteOffset, LaterSize, IsOverwriteEnd); + } else if (EnablePartialStoreMerging && + OR == OW_PartialEarlierWithFullLater) { + auto *Earlier = dyn_cast<StoreInst>(DepWrite); + auto *Later = dyn_cast<StoreInst>(Inst); + if (Earlier && isa<ConstantInt>(Earlier->getValueOperand()) && + Later && isa<ConstantInt>(Later->getValueOperand())) { + // If the store we find is: + // a) partially overwritten by the store to 'Loc' + // b) the later store is fully contained in the earlier one and + // c) they both have a constant value + // Merge the two stores, replacing the earlier store's value with a + // merge of both values. + // TODO: Deal with other constant types (vectors, etc), and probably + // some mem intrinsics (if needed) + + APInt EarlierValue = + cast<ConstantInt>(Earlier->getValueOperand())->getValue(); + APInt LaterValue = + cast<ConstantInt>(Later->getValueOperand())->getValue(); + unsigned LaterBits = LaterValue.getBitWidth(); + assert(EarlierValue.getBitWidth() > LaterValue.getBitWidth()); + LaterValue = LaterValue.zext(EarlierValue.getBitWidth()); + + // Offset of the smaller store inside the larger store + unsigned BitOffsetDiff = (InstWriteOffset - DepWriteOffset) * 8; + unsigned LShiftAmount = + DL.isBigEndian() + ? 
EarlierValue.getBitWidth() - BitOffsetDiff - LaterBits + : BitOffsetDiff; + APInt Mask = + APInt::getBitsSet(EarlierValue.getBitWidth(), LShiftAmount, + LShiftAmount + LaterBits); + // Clear the bits we'll be replacing, then OR with the smaller + // store, shifted appropriately. + APInt Merged = + (EarlierValue & ~Mask) | (LaterValue << LShiftAmount); + DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *DepWrite + << "\n Later: " << *Inst + << "\n Merged Value: " << Merged << '\n'); + + auto *SI = new StoreInst( + ConstantInt::get(Earlier->getValueOperand()->getType(), Merged), + Earlier->getPointerOperand(), false, Earlier->getAlignment(), + Earlier->getOrdering(), Earlier->getSyncScopeID(), DepWrite); + + unsigned MDToKeep[] = {LLVMContext::MD_dbg, LLVMContext::MD_tbaa, + LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, + LLVMContext::MD_nontemporal}; + SI->copyMetadata(*DepWrite, MDToKeep); + ++NumModifiedStores; + + // Remove earlier, wider, store + size_t Idx = InstrOrdering.lookup(DepWrite); + InstrOrdering.erase(DepWrite); + InstrOrdering.insert(std::make_pair(SI, Idx)); + + // Delete the old stores and now-dead instructions that feed them. + deleteDeadInstruction(Inst, &BBI, *MD, *TLI, IOL, &InstrOrdering); + deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, + &InstrOrdering); + MadeChange = true; + + // We erased DepWrite and Inst (Loc); start over. + break; + } } } @@ -1137,7 +1251,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, if (DepWrite == &BB.front()) break; // Can't look past this instruction if it might read 'Loc'. - if (AA->getModRefInfo(DepWrite, Loc) & MRI_Ref) + if (isRefSet(AA->getModRefInfo(DepWrite, Loc))) break; InstDep = MD->getPointerDependencyFrom(Loc, /*isLoad=*/ false, @@ -1190,9 +1304,12 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) { } namespace { + /// A legacy pass for the legacy pass manager that wraps \c DSEPass. class DSELegacyPass : public FunctionPass { public: + static char ID; // Pass identification, replacement for typeid + DSELegacyPass() : FunctionPass(ID) { initializeDSELegacyPassPass(*PassRegistry::getPassRegistry()); } @@ -1221,12 +1338,12 @@ public: AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<MemoryDependenceWrapperPass>(); } - - static char ID; // Pass identification, replacement for typeid }; + } // end anonymous namespace char DSELegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(DSELegacyPass, "dse", "Dead Store Elimination", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) diff --git a/lib/Transforms/Scalar/DivRemPairs.cpp b/lib/Transforms/Scalar/DivRemPairs.cpp new file mode 100644 index 000000000000..e383af89a384 --- /dev/null +++ b/lib/Transforms/Scalar/DivRemPairs.cpp @@ -0,0 +1,206 @@ +//===- DivRemPairs.cpp - Hoist/decompose division and remainder -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass hoists and/or decomposes integer division and remainder +// instructions to enable CFG improvements and better codegen. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/DivRemPairs.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BypassSlowDivision.h" +using namespace llvm; + +#define DEBUG_TYPE "div-rem-pairs" +STATISTIC(NumPairs, "Number of div/rem pairs"); +STATISTIC(NumHoisted, "Number of instructions hoisted"); +STATISTIC(NumDecomposed, "Number of instructions decomposed"); + +/// Find matching pairs of integer div/rem ops (they have the same numerator, +/// denominator, and signedness). If they exist in different basic blocks, bring +/// them together by hoisting or replace the common division operation that is +/// implicit in the remainder: +/// X % Y <--> X - ((X / Y) * Y). +/// +/// We can largely ignore the normal safety and cost constraints on speculation +/// of these ops when we find a matching pair. This is because we are already +/// guaranteed that any exceptions and most cost are already incurred by the +/// first member of the pair. +/// +/// Note: This transform could be an oddball enhancement to EarlyCSE, GVN, or +/// SimplifyCFG, but it's split off on its own because it's different enough +/// that it doesn't quite match the stated objectives of those passes. +static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, + const DominatorTree &DT) { + bool Changed = false; + + // Insert all divide and remainder instructions into maps keyed by their + // operands and opcode (signed or unsigned). + DenseMap<DivRemMapKey, Instruction *> DivMap, RemMap; + for (auto &BB : F) { + for (auto &I : BB) { + if (I.getOpcode() == Instruction::SDiv) + DivMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I; + else if (I.getOpcode() == Instruction::UDiv) + DivMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I; + else if (I.getOpcode() == Instruction::SRem) + RemMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I; + else if (I.getOpcode() == Instruction::URem) + RemMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I; + } + } + + // We can iterate over either map because we are only looking for matched + // pairs. Choose remainders for efficiency because they are usually even more + // rare than division. + for (auto &RemPair : RemMap) { + // Find the matching division instruction from the division map. + Instruction *DivInst = DivMap[RemPair.getFirst()]; + if (!DivInst) + continue; + + // We have a matching pair of div/rem instructions. If one dominates the + // other, hoist and/or replace one. + NumPairs++; + Instruction *RemInst = RemPair.getSecond(); + bool IsSigned = DivInst->getOpcode() == Instruction::SDiv; + bool HasDivRemOp = TTI.hasDivRemOp(DivInst->getType(), IsSigned); + + // If the target supports div+rem and the instructions are in the same block + // already, there's nothing to do. The backend should handle this. If the + // target does not support div+rem, then we will decompose the rem. + if (HasDivRemOp && RemInst->getParent() == DivInst->getParent()) + continue; + + bool DivDominates = DT.dominates(DivInst, RemInst); + if (!DivDominates && !DT.dominates(RemInst, DivInst)) + continue; + + if (HasDivRemOp) { + // The target has a single div/rem operation. 
Hoist the lower instruction + // to make the matched pair visible to the backend. + if (DivDominates) + RemInst->moveAfter(DivInst); + else + DivInst->moveAfter(RemInst); + NumHoisted++; + } else { + // The target does not have a single div/rem operation. Decompose the + // remainder calculation as: + // X % Y --> X - ((X / Y) * Y). + Value *X = RemInst->getOperand(0); + Value *Y = RemInst->getOperand(1); + Instruction *Mul = BinaryOperator::CreateMul(DivInst, Y); + Instruction *Sub = BinaryOperator::CreateSub(X, Mul); + + // If the remainder dominates, then hoist the division up to that block: + // + // bb1: + // %rem = srem %x, %y + // bb2: + // %div = sdiv %x, %y + // --> + // bb1: + // %div = sdiv %x, %y + // %mul = mul %div, %y + // %rem = sub %x, %mul + // + // If the division dominates, it's already in the right place. The mul+sub + // will be in a different block because we don't assume that they are + // cheap to speculatively execute: + // + // bb1: + // %div = sdiv %x, %y + // bb2: + // %rem = srem %x, %y + // --> + // bb1: + // %div = sdiv %x, %y + // bb2: + // %mul = mul %div, %y + // %rem = sub %x, %mul + // + // If the div and rem are in the same block, we do the same transform, + // but any code movement would be within the same block. + + if (!DivDominates) + DivInst->moveBefore(RemInst); + Mul->insertAfter(RemInst); + Sub->insertAfter(Mul); + + // Now kill the explicit remainder. We have replaced it with: + // (sub X, (mul (div X, Y), Y) + RemInst->replaceAllUsesWith(Sub); + RemInst->eraseFromParent(); + NumDecomposed++; + } + Changed = true; + } + + return Changed; +} + +// Pass manager boilerplate below here. + +namespace { +struct DivRemPairsLegacyPass : public FunctionPass { + static char ID; + DivRemPairsLegacyPass() : FunctionPass(ID) { + initializeDivRemPairsLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.setPreservesCFG(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + FunctionPass::getAnalysisUsage(AU); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + return optimizeDivRem(F, TTI, DT); + } +}; +} + +char DivRemPairsLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(DivRemPairsLegacyPass, "div-rem-pairs", + "Hoist/decompose integer division and remainder", false, + false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(DivRemPairsLegacyPass, "div-rem-pairs", + "Hoist/decompose integer division and remainder", false, + false) +FunctionPass *llvm::createDivRemPairsPass() { + return new DivRemPairsLegacyPass(); +} + +PreservedAnalyses DivRemPairsPass::run(Function &F, + FunctionAnalysisManager &FAM) { + TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F); + DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); + if (!optimizeDivRem(F, TTI, DT)) + return PreservedAnalyses::all(); + // TODO: This pass just hoists/replaces math ops - all analyses are preserved? 
+ PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + PA.preserve<GlobalsAA>(); + return PA; +} diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index c5c9b2c185d6..5798e1c4ee99 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -13,9 +13,12 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/EarlyCSE.h" +#include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/Hashing.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopedHashTable.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" @@ -24,18 +27,37 @@ #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/AtomicOrdering.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/RecyclingAllocator.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" +#include <cassert> #include <deque> +#include <memory> +#include <utility> + using namespace llvm; using namespace llvm::PatternMatch; @@ -53,6 +75,7 @@ STATISTIC(NumDSE, "Number of trivial dead stores removed"); //===----------------------------------------------------------------------===// namespace { + /// \brief Struct representing the available values in the scoped hash table. struct SimpleValue { Instruction *Inst; @@ -77,20 +100,25 @@ struct SimpleValue { isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst); } }; -} + +} // end anonymous namespace namespace llvm { + template <> struct DenseMapInfo<SimpleValue> { static inline SimpleValue getEmptyKey() { return DenseMapInfo<Instruction *>::getEmptyKey(); } + static inline SimpleValue getTombstoneKey() { return DenseMapInfo<Instruction *>::getTombstoneKey(); } + static unsigned getHashValue(SimpleValue Val); static bool isEqual(SimpleValue LHS, SimpleValue RHS); }; -} + +} // end namespace llvm unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { Instruction *Inst = Val.Inst; @@ -115,6 +143,21 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { return hash_combine(Inst->getOpcode(), Pred, LHS, RHS); } + // Hash min/max/abs (cmp + select) to allow for commuted operands. + // Min/max may also have non-canonical compare predicate (eg, the compare for + // smin may use 'sgt' rather than 'slt'), and non-canonical operands in the + // compare. + Value *A, *B; + SelectPatternFlavor SPF = matchSelectPattern(Inst, A, B).Flavor; + // TODO: We should also detect FP min/max. 
+ if (SPF == SPF_SMIN || SPF == SPF_SMAX || + SPF == SPF_UMIN || SPF == SPF_UMAX || + SPF == SPF_ABS || SPF == SPF_NABS) { + if (A > B) + std::swap(A, B); + return hash_combine(Inst->getOpcode(), SPF, A, B); + } + if (CastInst *CI = dyn_cast<CastInst>(Inst)) return hash_combine(CI->getOpcode(), CI->getType(), CI->getOperand(0)); @@ -173,6 +216,20 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate(); } + // Min/max/abs can occur with commuted operands, non-canonical predicates, + // and/or non-canonical operands. + Value *LHSA, *LHSB; + SelectPatternFlavor LSPF = matchSelectPattern(LHSI, LHSA, LHSB).Flavor; + // TODO: We should also detect FP min/max. + if (LSPF == SPF_SMIN || LSPF == SPF_SMAX || + LSPF == SPF_UMIN || LSPF == SPF_UMAX || + LSPF == SPF_ABS || LSPF == SPF_NABS) { + Value *RHSA, *RHSB; + SelectPatternFlavor RSPF = matchSelectPattern(RHSI, RHSA, RHSB).Flavor; + return (LSPF == RSPF && ((LHSA == RHSA && LHSB == RHSB) || + (LHSA == RHSB && LHSB == RHSA))); + } + return false; } @@ -181,6 +238,7 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { //===----------------------------------------------------------------------===// namespace { + /// \brief Struct representing the available call values in the scoped hash /// table. struct CallValue { @@ -206,20 +264,25 @@ struct CallValue { return true; } }; -} + +} // end anonymous namespace namespace llvm { + template <> struct DenseMapInfo<CallValue> { static inline CallValue getEmptyKey() { return DenseMapInfo<Instruction *>::getEmptyKey(); } + static inline CallValue getTombstoneKey() { return DenseMapInfo<Instruction *>::getTombstoneKey(); } + static unsigned getHashValue(CallValue Val); static bool isEqual(CallValue LHS, CallValue RHS); }; -} + +} // end namespace llvm unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) { Instruction *Inst = Val.Inst; @@ -241,6 +304,7 @@ bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) { //===----------------------------------------------------------------------===// namespace { + /// \brief A simple and fast domtree-based CSE pass. /// /// This pass does a simple depth-first walk over the dominator tree, @@ -257,10 +321,13 @@ public: const SimplifyQuery SQ; MemorySSA *MSSA; std::unique_ptr<MemorySSAUpdater> MSSAUpdater; - typedef RecyclingAllocator< - BumpPtrAllocator, ScopedHashTableVal<SimpleValue, Value *>> AllocatorTy; - typedef ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>, - AllocatorTy> ScopedHTType; + + using AllocatorTy = + RecyclingAllocator<BumpPtrAllocator, + ScopedHashTableVal<SimpleValue, Value *>>; + using ScopedHTType = + ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>, + AllocatorTy>; /// \brief A scoped hash table of the current values of all of our simple /// scalar expressions. @@ -285,44 +352,45 @@ public: /// present the table; it is the responsibility of the consumer to inspect /// the atomicity/volatility if needed. 
struct LoadValue { - Instruction *DefInst; - unsigned Generation; - int MatchingId; - bool IsAtomic; - bool IsInvariant; - LoadValue() - : DefInst(nullptr), Generation(0), MatchingId(-1), IsAtomic(false), - IsInvariant(false) {} + Instruction *DefInst = nullptr; + unsigned Generation = 0; + int MatchingId = -1; + bool IsAtomic = false; + bool IsInvariant = false; + + LoadValue() = default; LoadValue(Instruction *Inst, unsigned Generation, unsigned MatchingId, bool IsAtomic, bool IsInvariant) : DefInst(Inst), Generation(Generation), MatchingId(MatchingId), IsAtomic(IsAtomic), IsInvariant(IsInvariant) {} }; - typedef RecyclingAllocator<BumpPtrAllocator, - ScopedHashTableVal<Value *, LoadValue>> - LoadMapAllocator; - typedef ScopedHashTable<Value *, LoadValue, DenseMapInfo<Value *>, - LoadMapAllocator> LoadHTType; + + using LoadMapAllocator = + RecyclingAllocator<BumpPtrAllocator, + ScopedHashTableVal<Value *, LoadValue>>; + using LoadHTType = + ScopedHashTable<Value *, LoadValue, DenseMapInfo<Value *>, + LoadMapAllocator>; + LoadHTType AvailableLoads; /// \brief A scoped hash table of the current values of read-only call /// values. /// /// It uses the same generation count as loads. - typedef ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>> - CallHTType; + using CallHTType = + ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>; CallHTType AvailableCalls; /// \brief This is the current generation of the memory value. - unsigned CurrentGeneration; + unsigned CurrentGeneration = 0; /// \brief Set up the EarlyCSE runner for a particular function. EarlyCSE(const DataLayout &DL, const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI, DominatorTree &DT, AssumptionCache &AC, MemorySSA *MSSA) : TLI(TLI), TTI(TTI), DT(DT), AC(AC), SQ(DL, &TLI, &DT, &AC), MSSA(MSSA), - MSSAUpdater(make_unique<MemorySSAUpdater>(MSSA)), CurrentGeneration(0) { - } + MSSAUpdater(llvm::make_unique<MemorySSAUpdater>(MSSA)) {} bool run(); @@ -336,11 +404,10 @@ private: CallHTType &AvailableCalls) : Scope(AvailableValues), LoadScope(AvailableLoads), CallScope(AvailableCalls) {} - - private: NodeScope(const NodeScope &) = delete; - void operator=(const NodeScope &) = delete; + NodeScope &operator=(const NodeScope &) = delete; + private: ScopedHTType::ScopeTy Scope; LoadHTType::ScopeTy LoadScope; CallHTType::ScopeTy CallScope; @@ -356,8 +423,10 @@ private: CallHTType &AvailableCalls, unsigned cg, DomTreeNode *n, DomTreeNode::iterator child, DomTreeNode::iterator end) : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child), - EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls), - Processed(false) {} + EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls) + {} + StackNode(const StackNode &) = delete; + StackNode &operator=(const StackNode &) = delete; // Accessors. unsigned currentGeneration() { return CurrentGeneration; } @@ -365,27 +434,25 @@ private: void childGeneration(unsigned generation) { ChildGeneration = generation; } DomTreeNode *node() { return Node; } DomTreeNode::iterator childIter() { return ChildIter; } + DomTreeNode *nextChild() { DomTreeNode *child = *ChildIter; ++ChildIter; return child; } + DomTreeNode::iterator end() { return EndIter; } bool isProcessed() { return Processed; } void process() { Processed = true; } private: - StackNode(const StackNode &) = delete; - void operator=(const StackNode &) = delete; - - // Members. 
unsigned CurrentGeneration; unsigned ChildGeneration; DomTreeNode *Node; DomTreeNode::iterator ChildIter; DomTreeNode::iterator EndIter; NodeScope Scopes; - bool Processed; + bool Processed = false; }; /// \brief Wrapper class to handle memory instructions, including loads, @@ -393,24 +460,28 @@ private: class ParseMemoryInst { public: ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI) - : IsTargetMemInst(false), Inst(Inst) { + : Inst(Inst) { if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) if (TTI.getTgtMemIntrinsic(II, Info)) IsTargetMemInst = true; } + bool isLoad() const { if (IsTargetMemInst) return Info.ReadMem; return isa<LoadInst>(Inst); } + bool isStore() const { if (IsTargetMemInst) return Info.WriteMem; return isa<StoreInst>(Inst); } + bool isAtomic() const { if (IsTargetMemInst) return Info.Ordering != AtomicOrdering::NotAtomic; return Inst->isAtomic(); } + bool isUnordered() const { if (IsTargetMemInst) return Info.isUnordered(); @@ -447,6 +518,7 @@ private: return (getPointerOperand() == Inst.getPointerOperand() && getMatchingId() == Inst.getMatchingId()); } + bool isValid() const { return getPointerOperand() != nullptr; } // For regular (non-intrinsic) loads/stores, this is set to -1. For @@ -457,6 +529,7 @@ private: if (IsTargetMemInst) return Info.MatchingId; return -1; } + Value *getPointerOperand() const { if (IsTargetMemInst) return Info.PtrVal; if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { @@ -466,17 +539,19 @@ private: } return nullptr; } + bool mayReadFromMemory() const { if (IsTargetMemInst) return Info.ReadMem; return Inst->mayReadFromMemory(); } + bool mayWriteToMemory() const { if (IsTargetMemInst) return Info.WriteMem; return Inst->mayWriteToMemory(); } private: - bool IsTargetMemInst; + bool IsTargetMemInst = false; MemIntrinsicInfo Info; Instruction *Inst; }; @@ -524,8 +599,8 @@ private: for (MemoryPhi *MP : PhisToCheck) { MemoryAccess *FirstIn = MP->getIncomingValue(0); - if (all_of(MP->incoming_values(), - [=](Use &In) { return In == FirstIn; })) + if (llvm::all_of(MP->incoming_values(), + [=](Use &In) { return In == FirstIn; })) WorkQueue.push_back(MP); } PhisToCheck.clear(); @@ -533,7 +608,8 @@ private: } } }; -} + +} // end anonymous namespace /// Determine if the memory referenced by LaterInst is from the same heap /// version as EarlierInst. @@ -663,6 +739,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { continue; } + // Skip sideeffect intrinsics, for the same reason as assume intrinsics. + if (match(Inst, m_Intrinsic<Intrinsic::sideeffect>())) { + DEBUG(dbgs() << "EarlyCSE skipping sideeffect: " << *Inst << '\n'); + continue; + } + // Skip invariant.start intrinsics since they only read memory, and we can // forward values across it. Also, we don't need to consume the last store // since the semantics of invariant.start allow us to perform DSE of the @@ -1014,6 +1096,7 @@ PreservedAnalyses EarlyCSEPass::run(Function &F, } namespace { + /// \brief A simple and fast domtree-based CSE pass.
/// /// This pass does a simple depth-first walk over the dominator tree, @@ -1062,7 +1145,8 @@ public: AU.setPreservesCFG(); } }; -} + +} // end anonymous namespace using EarlyCSELegacyPass = EarlyCSELegacyCommonPass</*UseMemorySSA=*/false>; diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index ea28705e684d..e2c1eaf58e43 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -20,39 +20,64 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" -#include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/PHITransAddr.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Transforms/Utils/VNCoercion.h" - +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <utility> #include <vector> + using namespace llvm; using namespace llvm::gvn; using namespace llvm::VNCoercion; @@ -80,6 +105,7 @@ MaxRecurseDepth("max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore, struct llvm::GVN::Expression { uint32_t opcode; Type *type; + bool commutative = false; SmallVector<uint32_t, 4> varargs; Expression(uint32_t o = ~2U) : opcode(o) {} @@ -104,20 +130,23 @@ struct llvm::GVN::Expression { }; namespace llvm { + template <> struct DenseMapInfo<GVN::Expression> { static inline GVN::Expression getEmptyKey() { return ~0U; } - static inline GVN::Expression getTombstoneKey() { return ~1U; } static unsigned getHashValue(const GVN::Expression &e) { using llvm::hash_value; + return static_cast<unsigned>(hash_value(e)); } + static bool isEqual(const GVN::Expression &LHS, const GVN::Expression &RHS) { return LHS 
== RHS; } }; -} // End llvm namespace. + +} // end namespace llvm /// Represents a particular available value that we know how to materialize. /// Materialization of an AvailableValue never fails. An AvailableValue is @@ -216,6 +245,7 @@ struct llvm::gvn::AvailableValueInBlock { unsigned Offset = 0) { return get(BB, AvailableValue::get(V, Offset)); } + static AvailableValueInBlock getUndef(BasicBlock *BB) { return get(BB, AvailableValue::getUndef()); } @@ -246,6 +276,7 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) { assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!"); if (e.varargs[0] > e.varargs[1]) std::swap(e.varargs[0], e.varargs[1]); + e.commutative = true; } if (CmpInst *C = dyn_cast<CmpInst>(I)) { @@ -256,6 +287,7 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) { Predicate = CmpInst::getSwappedPredicate(Predicate); } e.opcode = (C->getOpcode() << 8) | Predicate; + e.commutative = true; } else if (InsertValueInst *E = dyn_cast<InsertValueInst>(I)) { for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end(); II != IE; ++II) @@ -281,6 +313,7 @@ GVN::Expression GVN::ValueTable::createCmpExpr(unsigned Opcode, Predicate = CmpInst::getSwappedPredicate(Predicate); } e.opcode = (Opcode << 8) | Predicate; + e.commutative = true; return e; } @@ -340,7 +373,7 @@ GVN::Expression GVN::ValueTable::createExtractvalueExpr(ExtractValueInst *EI) { // ValueTable External Functions //===----------------------------------------------------------------------===// -GVN::ValueTable::ValueTable() : nextValueNumber(1) {} +GVN::ValueTable::ValueTable() = default; GVN::ValueTable::ValueTable(const ValueTable &) = default; GVN::ValueTable::ValueTable(ValueTable &&) = default; GVN::ValueTable::~ValueTable() = default; @@ -348,25 +381,25 @@ GVN::ValueTable::~ValueTable() = default; /// add - Insert a value into the table with a specified value number. 
void GVN::ValueTable::add(Value *V, uint32_t num) { valueNumbering.insert(std::make_pair(V, num)); + if (PHINode *PN = dyn_cast<PHINode>(V)) + NumberingPhi[num] = PN; } uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) { if (AA->doesNotAccessMemory(C)) { Expression exp = createExpr(C); - uint32_t &e = expressionNumbering[exp]; - if (!e) e = nextValueNumber++; + uint32_t e = assignExpNewValueNum(exp).first; valueNumbering[C] = e; return e; } else if (AA->onlyReadsMemory(C)) { Expression exp = createExpr(C); - uint32_t &e = expressionNumbering[exp]; - if (!e) { - e = nextValueNumber++; - valueNumbering[C] = e; - return e; + auto ValNum = assignExpNewValueNum(exp); + if (ValNum.second) { + valueNumbering[C] = ValNum.first; + return ValNum.first; } if (!MD) { - e = nextValueNumber++; + uint32_t e = assignExpNewValueNum(exp).first; valueNumbering[C] = e; return e; } @@ -452,7 +485,6 @@ uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) { uint32_t v = lookupOrAdd(cdep); valueNumbering[C] = v; return v; - } else { valueNumbering[C] = nextValueNumber; return nextValueNumber++; @@ -522,23 +554,29 @@ uint32_t GVN::ValueTable::lookupOrAdd(Value *V) { case Instruction::ExtractValue: exp = createExtractvalueExpr(cast<ExtractValueInst>(I)); break; + case Instruction::PHI: + valueNumbering[V] = nextValueNumber; + NumberingPhi[nextValueNumber] = cast<PHINode>(V); + return nextValueNumber++; default: valueNumbering[V] = nextValueNumber; return nextValueNumber++; } - uint32_t& e = expressionNumbering[exp]; - if (!e) e = nextValueNumber++; + uint32_t e = assignExpNewValueNum(exp).first; valueNumbering[V] = e; return e; } /// Returns the value number of the specified value. Fails if /// the value has not yet been numbered. -uint32_t GVN::ValueTable::lookup(Value *V) const { +uint32_t GVN::ValueTable::lookup(Value *V, bool Verify) const { DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V); - assert(VI != valueNumbering.end() && "Value not numbered?"); - return VI->second; + if (Verify) { + assert(VI != valueNumbering.end() && "Value not numbered?"); + return VI->second; + } + return (VI != valueNumbering.end()) ? VI->second : 0; } /// Returns the value number of the given comparison, @@ -549,21 +587,28 @@ uint32_t GVN::ValueTable::lookupOrAddCmp(unsigned Opcode, CmpInst::Predicate Predicate, Value *LHS, Value *RHS) { Expression exp = createCmpExpr(Opcode, Predicate, LHS, RHS); - uint32_t& e = expressionNumbering[exp]; - if (!e) e = nextValueNumber++; - return e; + return assignExpNewValueNum(exp).first; } /// Remove all entries from the ValueTable. void GVN::ValueTable::clear() { valueNumbering.clear(); expressionNumbering.clear(); + NumberingPhi.clear(); + PhiTranslateTable.clear(); nextValueNumber = 1; + Expressions.clear(); + ExprIdx.clear(); + nextExprNumber = 0; } /// Remove a value from the value numbering. void GVN::ValueTable::erase(Value *V) { + uint32_t Num = valueNumbering.lookup(V); valueNumbering.erase(V); + // If V is a PHINode, V <--> value number is a one-to-one mapping. + if (isa<PHINode>(V)) + NumberingPhi.erase(Num); } /// verifyRemoved - Verify that the value is removed from all internal data @@ -693,9 +738,6 @@ SpeculationFailure: return false; } - - - /// Given a set of loads specified by ValuesPerBlock, /// construct SSA form, allowing us to eliminate LI. This returns the value /// that should be used at LI's definition site.
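The SSA construction referred to here happens outside the hunks shown. A rough sketch of the idea, assuming the llvm::SSAUpdater API (the helper name and the shape of the availability list are illustrative, not the pass's actual code):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <utility>

// Given per-block available values for a load, build SSA form (inserting
// PHIs where needed) and return the value to use at the load itself.
static llvm::Value *
constructSSAForLoad(llvm::LoadInst *LI,
                    llvm::ArrayRef<std::pair<llvm::BasicBlock *, llvm::Value *>>
                        Available) {
  llvm::SSAUpdater Updater;
  Updater.Initialize(LI->getType(), LI->getName());
  // Register each block's available value; SSAUpdater places PHI nodes at
  // the necessary join points.
  for (const auto &BV : Available)
    Updater.AddAvailableValue(BV.first, BV.second);
  // Query the value live at the load's block.
  return Updater.GetValueInMiddleOfBlock(LI->getParent());
}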
@@ -789,6 +831,7 @@ static void reportMayClobberedLoad(LoadInst *LI, MemDepResult DepInfo, DominatorTree *DT, OptimizationRemarkEmitter *ORE) { using namespace ore; + User *OtherAccess = nullptr; OptimizationRemarkMissed R(DEBUG_TYPE, "LoadClobbered", LI); @@ -817,7 +860,6 @@ static void reportMayClobberedLoad(LoadInst *LI, MemDepResult DepInfo, bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo, Value *Address, AvailableValue &Res) { - assert((DepInfo.isDef() || DepInfo.isClobber()) && "expected a local dependence"); assert(LI->isUnordered() && "rules below are incorrect for ordered access"); @@ -879,8 +921,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo, Instruction *I = DepInfo.getInst(); dbgs() << " is clobbered by " << *I << '\n'; ); - - if (ORE->allowExtraAnalysis()) + if (ORE->allowExtraAnalysis(DEBUG_TYPE)) reportMayClobberedLoad(LI, DepInfo, DT, ORE); return false; @@ -949,7 +990,6 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo, void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, AvailValInBlkVect &ValuesPerBlock, UnavailBlkVect &UnavailableBlocks) { - // Filter out useless results (non-locals, etc). Keep track of the blocks // where we have a value available in repl, also keep track of whether we see // dependencies that produce an unknown value for the load (such as a call @@ -1009,7 +1049,32 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // backwards through predecessors if needed. BasicBlock *LoadBB = LI->getParent(); BasicBlock *TmpBB = LoadBB; + bool IsSafeToSpeculativelyExecute = isSafeToSpeculativelyExecute(LI); + // Check that there are no implicit control flow instructions above our load in + // its block. If there is an instruction that doesn't always pass the + // execution to the following instruction, then moving through it may become + // invalid. For example: + // + // int arr[LEN]; + // int index = ???; + // ... + // guard(0 <= index && index < LEN); + // use(arr[index]); + // + // It is illegal to move the array access to any point above the guard, + // because if the index is out of bounds we should deoptimize rather than + // access the array. + // Check that there is no guard in this block above our instruction. + if (!IsSafeToSpeculativelyExecute) { + auto It = FirstImplicitControlFlowInsts.find(TmpBB); + if (It != FirstImplicitControlFlowInsts.end()) { + assert(It->second->getParent() == TmpBB && + "Implicit control flow map broken?"); + if (OI->dominates(It->second, LI)) + return false; + } + } while (TmpBB->getSinglePredecessor()) { TmpBB = TmpBB->getSinglePredecessor(); if (TmpBB == LoadBB) // Infinite (unreachable) loop. @@ -1024,6 +1089,11 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // which it was not previously executed. if (TmpBB->getTerminator()->getNumSuccessors() != 1) return false; + + // Check that there is no implicit control flow in a block above. + if (!IsSafeToSpeculativelyExecute && + FirstImplicitControlFlowInsts.count(TmpBB)) + return false; } assert(TmpBB); @@ -1128,8 +1198,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (!CanDoPRE) { while (!NewInsts.empty()) { Instruction *I = NewInsts.pop_back_val(); - if (MD) MD->removeInstruction(I); - I->eraseFromParent(); + markInstructionForDeletion(I); } // HINT: Don't revert the edge-splitting as the following transformation may // also need to split these critical edges.
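A small C++ analogue of the guard scenario in the comment above (the guard function is a hypothetical stand-in for a deoptimization side exit): the load itself never traps, yet hoisting it above the guard would read out of bounds, which is exactly why the code above refuses to move loads past implicit control flow.

#include <cstdlib>

// Hypothetical guard: never returns when Cond is false (models a side exit).
static void guard(bool Cond) {
  if (!Cond)
    std::abort();
}

int useElement(const int *Arr, long Len, long Index) {
  guard(0 <= Index && Index < Len);
  // Safe only because the guard already executed; a compiler that hoisted
  // this load above the guard could access the array out of bounds.
  return Arr[Index];
}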
@@ -1206,8 +1275,10 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (V->getType()->isPtrOrPtrVectorTy()) MD->invalidateCachedPointerInfo(V); markInstructionForDeletion(LI); - ORE->emit(OptimizationRemark(DEBUG_TYPE, "LoadPRE", LI) - << "load eliminated by PRE"); + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "LoadPRE", LI) + << "load eliminated by PRE"; + }); ++NumPRELoad; return true; } @@ -1215,17 +1286,23 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, static void reportLoadElim(LoadInst *LI, Value *AvailableValue, OptimizationRemarkEmitter *ORE) { using namespace ore; - ORE->emit(OptimizationRemark(DEBUG_TYPE, "LoadElim", LI) - << "load of type " << NV("Type", LI->getType()) << " eliminated" - << setExtraArgs() << " in favor of " - << NV("InfavorOfValue", AvailableValue)); + + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "LoadElim", LI) + << "load of type " << NV("Type", LI->getType()) << " eliminated" + << setExtraArgs() << " in favor of " + << NV("InfavorOfValue", AvailableValue); + }); } /// Attempt to eliminate a load whose dependencies are /// non-local by performing PHI construction. bool GVN::processNonLocalLoad(LoadInst *LI) { // non-local speculations are not allowed under asan. - if (LI->getParent()->getParent()->hasFnAttribute(Attribute::SanitizeAddress)) + if (LI->getParent()->getParent()->hasFnAttribute( + Attribute::SanitizeAddress) || + LI->getParent()->getParent()->hasFnAttribute( + Attribute::SanitizeHWAddress)) return false; // Step 1: Find the non-local dependencies of the load. @@ -1322,6 +1399,11 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { } markInstructionForDeletion(IntrinsicI); return false; + } else if (isa<Constant>(V)) { + // If it's not false, and constant, it must evaluate to true. This means our + // assume is assume(true), and thus, pointless, and we don't want to do + // anything more here. + return false; } Constant *True = ConstantInt::getTrue(V->getContext()); @@ -1452,6 +1534,106 @@ bool GVN::processLoad(LoadInst *L) { return false; } +/// Return a pair the first field showing the value number of \p Exp and the +/// second field showing whether it is a value number newly created. +std::pair<uint32_t, bool> +GVN::ValueTable::assignExpNewValueNum(Expression &Exp) { + uint32_t &e = expressionNumbering[Exp]; + bool CreateNewValNum = !e; + if (CreateNewValNum) { + Expressions.push_back(Exp); + if (ExprIdx.size() < nextValueNumber + 1) + ExprIdx.resize(nextValueNumber * 2); + e = nextValueNumber; + ExprIdx[nextValueNumber++] = nextExprNumber++; + } + return {e, CreateNewValNum}; +} + +/// Return whether all the values related with the same \p num are +/// defined in \p BB. +bool GVN::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB, + GVN &Gvn) { + LeaderTableEntry *Vals = &Gvn.LeaderTable[Num]; + while (Vals && Vals->BB == BB) + Vals = Vals->Next; + return !Vals; +} + +/// Wrap phiTranslateImpl to provide caching functionality. +uint32_t GVN::ValueTable::phiTranslate(const BasicBlock *Pred, + const BasicBlock *PhiBlock, uint32_t Num, + GVN &Gvn) { + auto FindRes = PhiTranslateTable.find({Num, Pred}); + if (FindRes != PhiTranslateTable.end()) + return FindRes->second; + uint32_t NewNum = phiTranslateImpl(Pred, PhiBlock, Num, Gvn); + PhiTranslateTable.insert({{Num, Pred}, NewNum}); + return NewNum; +} + +/// Translate value number \p Num using phis, so that it has the values of +/// the phis in BB. 
+uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred, + const BasicBlock *PhiBlock, + uint32_t Num, GVN &Gvn) { + if (PHINode *PN = NumberingPhi[Num]) { + for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) { + if (PN->getParent() == PhiBlock && PN->getIncomingBlock(i) == Pred) + if (uint32_t TransVal = lookup(PN->getIncomingValue(i), false)) + return TransVal; + } + return Num; + } + + // If any value related with Num is defined in a BB other than + // PhiBlock, it cannot depend on a phi in PhiBlock without going through + // a backedge. We can do an early exit in that case to save compile time. + if (!areAllValsInBB(Num, PhiBlock, Gvn)) + return Num; + + if (Num >= ExprIdx.size() || ExprIdx[Num] == 0) + return Num; + Expression Exp = Expressions[ExprIdx[Num]]; + + for (unsigned i = 0; i < Exp.varargs.size(); i++) { + // For InsertValue and ExtractValue, some varargs are index numbers + // instead of value numbers. Those index numbers should not be + // translated. + if ((i > 1 && Exp.opcode == Instruction::InsertValue) || + (i > 0 && Exp.opcode == Instruction::ExtractValue)) + continue; + Exp.varargs[i] = phiTranslate(Pred, PhiBlock, Exp.varargs[i], Gvn); + } + + if (Exp.commutative) { + assert(Exp.varargs.size() == 2 && "Unsupported commutative expression!"); + if (Exp.varargs[0] > Exp.varargs[1]) { + std::swap(Exp.varargs[0], Exp.varargs[1]); + uint32_t Opcode = Exp.opcode >> 8; + if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) + Exp.opcode = (Opcode << 8) | + CmpInst::getSwappedPredicate( + static_cast<CmpInst::Predicate>(Exp.opcode & 255)); + } + } + + if (uint32_t NewNum = expressionNumbering[Exp]) + return NewNum; + return Num; +} + +/// Erase stale entry from phiTranslate cache so phiTranslate can be computed +/// again. +void GVN::ValueTable::eraseTranslateCacheEntry(uint32_t Num, + const BasicBlock &CurrBlock) { + for (const BasicBlock *Pred : predecessors(&CurrBlock)) { + auto FindRes = PhiTranslateTable.find({Num, Pred}); + if (FindRes != PhiTranslateTable.end()) + PhiTranslateTable.erase(FindRes); + } +} + // In order to find a leader for a given value number at a // specific basic block, we first obtain the list of all Values for that number, // and then scan the list to find one whose block dominates the block in @@ -1496,6 +1678,13 @@ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E, return Pred != nullptr; } +void GVN::assignBlockRPONumber(Function &F) { + uint32_t NextBlockNumber = 1; + ReversePostOrderTraversal<Function *> RPOT(&F); + for (BasicBlock *BB : RPOT) + BlockRPONumber[BB] = NextBlockNumber++; +} + // Tries to replace instruction with const, using information from // ReplaceWithConstMap. bool GVN::replaceOperandsWithConsts(Instruction *Instr) const { @@ -1827,6 +2016,8 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, TLI = &RunTLI; VN.setAliasAnalysis(&RunAA); MD = RunMD; + OrderedInstructions OrderedInstrs(DT); + OI = &OrderedInstrs; VN.setMemDep(MD); ORE = RunORE; @@ -1857,6 +2048,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, // Fabricate val-num for dead-code in order to suppress assertion in // performPRE().
assignValNumForDeadCode(); + assignBlockRPONumber(F); bool PREChanged = true; while (PREChanged) { PREChanged = performPRE(F); @@ -1908,14 +2100,26 @@ bool GVN::processBlock(BasicBlock *BB) { if (!AtStart) --BI; - for (SmallVectorImpl<Instruction *>::iterator I = InstrsToErase.begin(), - E = InstrsToErase.end(); I != E; ++I) { - DEBUG(dbgs() << "GVN removed: " << **I << '\n'); - if (MD) MD->removeInstruction(*I); - DEBUG(verifyRemoved(*I)); - (*I)->eraseFromParent(); + bool InvalidateImplicitCF = false; + const Instruction *MaybeFirstICF = FirstImplicitControlFlowInsts.lookup(BB); + for (auto *I : InstrsToErase) { + assert(I->getParent() == BB && "Removing instruction from wrong block?"); + DEBUG(dbgs() << "GVN removed: " << *I << '\n'); + if (MD) MD->removeInstruction(I); + DEBUG(verifyRemoved(I)); + if (MaybeFirstICF == I) { + // We have erased the first ICF in block. The map needs to be updated. + InvalidateImplicitCF = true; + // Do not keep dangling pointer on the erased instruction. + MaybeFirstICF = nullptr; + } + I->eraseFromParent(); } + + OI->invalidateBlock(BB); InstrsToErase.clear(); + if (InvalidateImplicitCF) + fillImplicitControlFlowInfo(BB); if (AtStart) BI = BB->begin(); @@ -1928,7 +2132,7 @@ bool GVN::processBlock(BasicBlock *BB) { // Instantiate an expression in a predecessor that lacked it. bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, - unsigned int ValNo) { + BasicBlock *Curr, unsigned int ValNo) { // Because we are going top-down through the block, all value numbers // will be available in the predecessor by the time we need them. Any // that weren't originally present will have been instantiated earlier @@ -1946,7 +2150,9 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, success = false; break; } - if (Value *V = findLeader(Pred, VN.lookup(Op))) { + uint32_t TValNo = + VN.phiTranslate(Pred, Curr, VN.lookup(Op), *this); + if (Value *V = findLeader(Pred, TValNo)) { Instr->setOperand(i, V); } else { success = false; @@ -1963,10 +2169,12 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, Instr->insertBefore(Pred->getTerminator()); Instr->setName(Instr->getName() + ".pre"); Instr->setDebugLoc(Instr->getDebugLoc()); - VN.add(Instr, ValNo); + + unsigned Num = VN.lookupOrAdd(Instr); + VN.add(Instr, Num); // Update the availability map to include the new instruction. - addToLeaderTable(ValNo, Instr, Pred); + addToLeaderTable(Num, Instr, Pred); return true; } @@ -2004,18 +2212,27 @@ bool GVN::performScalarPRE(Instruction *CurInst) { SmallVector<std::pair<Value *, BasicBlock *>, 8> predMap; for (BasicBlock *P : predecessors(CurrentBlock)) { - // We're not interested in PRE where the block is its - // own predecessor, or in blocks with predecessors - // that are not reachable. - if (P == CurrentBlock) { + // We're not interested in PRE where blocks with predecessors that are + // not reachable. + if (!DT->isReachableFromEntry(P)) { NumWithout = 2; break; - } else if (!DT->isReachableFromEntry(P)) { + } + // It is not safe to do PRE when P->CurrentBlock is a loop backedge, and + // when CurInst has operand defined in CurrentBlock (so it may be defined + // by phi in the loop header). 
+ if (BlockRPONumber[P] >= BlockRPONumber[CurrentBlock] && + llvm::any_of(CurInst->operands(), [&](const Use &U) { + if (auto *Inst = dyn_cast<Instruction>(U.get())) + return Inst->getParent() == CurrentBlock; + return false; + })) { NumWithout = 2; break; } - Value *predV = findLeader(P, ValNo); + uint32_t TValNo = VN.phiTranslate(P, CurrentBlock, ValNo, *this); + Value *predV = findLeader(P, TValNo); if (!predV) { predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P)); PREPred = P; @@ -2041,6 +2258,20 @@ bool GVN::performScalarPRE(Instruction *CurInst) { Instruction *PREInstr = nullptr; if (NumWithout != 0) { + if (!isSafeToSpeculativelyExecute(CurInst)) { + // It is only valid to insert a new instruction if the current instruction + // is always executed. An instruction with implicit control flow could + // prevent us from doing it. If we cannot speculate the execution, then + // PRE should be prohibited. + auto It = FirstImplicitControlFlowInsts.find(CurrentBlock); + if (It != FirstImplicitControlFlowInsts.end()) { + assert(It->second->getParent() == CurrentBlock && + "Implicit control flow map broken?"); + if (OI->dominates(It->second, CurInst)) + return false; + } + } + // Don't do PRE across indirect branch. if (isa<IndirectBrInst>(PREPred->getTerminator())) return false; @@ -2055,7 +2286,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) { } // We need to insert somewhere, so let's give it a shot PREInstr = CurInst->clone(); - if (!performScalarPREInsertion(PREInstr, PREPred, ValNo)) { + if (!performScalarPREInsertion(PREInstr, PREPred, CurrentBlock, ValNo)) { // If we failed insertion, make sure we remove the instruction. DEBUG(verifyRemoved(PREInstr)); PREInstr->deleteValue(); @@ -2065,7 +2296,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) { // Either we should have filled in the PRE instruction, or we should // not have needed insertions. - assert (PREInstr != nullptr || NumWithout == 0); + assert(PREInstr != nullptr || NumWithout == 0); ++NumGVNPRE; @@ -2074,13 +2305,19 @@ bool GVN::performScalarPRE(Instruction *CurInst) { PHINode::Create(CurInst->getType(), predMap.size(), CurInst->getName() + ".pre-phi", &CurrentBlock->front()); for (unsigned i = 0, e = predMap.size(); i != e; ++i) { - if (Value *V = predMap[i].first) + if (Value *V = predMap[i].first) { + // If we use an existing value in this phi, we have to patch the original + // value because the phi will be used to replace a later value. + patchReplacementInstruction(CurInst, V); Phi->addIncoming(V, predMap[i].second); - else + } else Phi->addIncoming(PREInstr, PREPred); } VN.add(Phi, ValNo); + // After creating a new PHI for ValNo, the phi translate result for ValNo will + // be changed, so erase the related stale entries in phi translate cache. + VN.eraseTranslateCacheEntry(ValNo, *CurrentBlock); addToLeaderTable(ValNo, Phi, CurrentBlock); Phi->setDebugLoc(CurInst->getDebugLoc()); CurInst->replaceAllUsesWith(Phi); @@ -2093,7 +2330,14 @@ bool GVN::performScalarPRE(Instruction *CurInst) { if (MD) MD->removeInstruction(CurInst); DEBUG(verifyRemoved(CurInst)); + bool InvalidateImplicitCF = + FirstImplicitControlFlowInsts.lookup(CurInst->getParent()) == CurInst; + // FIXME: Intended to be markInstructionForDeletion(CurInst), but it causes + // some assertion failures. 
+ OI->invalidateBlock(CurrentBlock); CurInst->eraseFromParent(); + if (InvalidateImplicitCF) + fillImplicitControlFlowInfo(CurrentBlock); ++NumGVNInstr; return true; @@ -2160,6 +2404,9 @@ bool GVN::iterateOnFunction(Function &F) { // RPOT walks the graph in its constructor and will not be invalidated during // processBlock. ReversePostOrderTraversal<Function *> RPOT(&F); + + for (BasicBlock *BB : RPOT) + fillImplicitControlFlowInfo(BB); for (BasicBlock *BB : RPOT) Changed |= processBlock(BB); @@ -2169,7 +2416,50 @@ void GVN::cleanupGlobalSets() { VN.clear(); LeaderTable.clear(); + BlockRPONumber.clear(); TableAllocator.Reset(); + FirstImplicitControlFlowInsts.clear(); +} + +void +GVN::fillImplicitControlFlowInfo(BasicBlock *BB) { + // Make sure that all marked instructions are actually deleted by this point, + // so that we don't need to care about omitting them. + assert(InstrsToErase.empty() && "Filling before removed all marked insns?"); + auto MayNotTransferExecutionToSuccessor = [&](const Instruction *I) { + // If a block's instruction doesn't always pass the control to its successor + // instruction, mark the block as having implicit control flow. We use this + // information to avoid wrong assumptions of the sort "if A is executed and + // B post-dominates A, then B is also executed". This is not true if there + // is an implicit control flow instruction (e.g. a guard) between them. + // + // TODO: Currently, isGuaranteedToTransferExecutionToSuccessor returns false + // for volatile stores and loads because they can trap. The discussion on + // whether or not it is correct is still ongoing. We might want to get rid + // of this logic in the future. Anyway, trapping instructions shouldn't + // introduce implicit control flow, so we explicitly allow them here. This + // must be removed once isGuaranteedToTransferExecutionToSuccessor is fixed. + if (isGuaranteedToTransferExecutionToSuccessor(I)) + return false; + if (isa<LoadInst>(I)) { + assert(cast<LoadInst>(I)->isVolatile() && + "Non-volatile load should transfer execution to successor!"); + return false; + } + if (isa<StoreInst>(I)) { + assert(cast<StoreInst>(I)->isVolatile() && + "Non-volatile store should transfer execution to successor!"); + return false; + } + return true; + }; + FirstImplicitControlFlowInsts.erase(BB); + + for (auto &I : *BB) + if (MayNotTransferExecutionToSuccessor(&I)) { + FirstImplicitControlFlowInsts[BB] = &I; + break; + } } /// Verify that the specified instruction does not occur in our @@ -2317,6 +2607,7 @@ void GVN::assignValNumForDeadCode() { class llvm::gvn::GVNLegacyPass : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid + explicit GVNLegacyPass(bool NoLoads = false) : FunctionPass(ID), NoLoads(NoLoads) { initializeGVNLegacyPassPass(*PassRegistry::getPassRegistry()); } @@ -2360,11 +2651,6 @@ private: char GVNLegacyPass::ID = 0; -// The public interface to this file...
-FunctionPass *llvm::createGVNPass(bool NoLoads) { - return new GVNLegacyPass(NoLoads); -} - INITIALIZE_PASS_BEGIN(GVNLegacyPass, "gvn", "Global Value Numbering", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) @@ -2374,3 +2660,8 @@ INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(GVNLegacyPass, "gvn", "Global Value Numbering", false, false) + +// The public interface to this file... +FunctionPass *llvm::createGVNPass(bool NoLoads) { + return new GVNLegacyPass(NoLoads); +} diff --git a/lib/Transforms/Scalar/GVNHoist.cpp b/lib/Transforms/Scalar/GVNHoist.cpp index 29de792bd248..c0cd1ea74a74 100644 --- a/lib/Transforms/Scalar/GVNHoist.cpp +++ b/lib/Transforms/Scalar/GVNHoist.cpp @@ -13,44 +13,72 @@ // 1. To reduce the code size. // 2. In some cases reduce critical path (by exposing more ILP). // +// The algorithm factors out the reachability of values such that multiple +// queries to find reachability of values are fast. This is based on finding the +// ANTIC points in the CFG which do not change during hoisting. The ANTIC points +// are basically the dominance-frontiers in the inverse graph. So we introduce a +// data structure (CHI nodes) to keep track of values flowing out of a basic +// block. We only do this for values with multiple occurrences in the function +// as they are the potential hoistable candidates. This approach allows us to +// hoist instructions to a basic block with more than two successors, as well as +// deal with infinite loops in a trivial way. +// +// Limitations: This pass does not hoist fully redundant expressions because +// they are already handled by GVN-PRE. It is advisable to run gvn-hoist before +// and after gvn-pre because gvn-pre creates opportunities for more instructions +// to be hoisted. +// // Hoisting may affect the performance in some cases. To mitigate that, hoisting // is disabled in the following cases. // 1. Scalars across calls. // 2. geps when corresponding load/store cannot be hoisted. -// -// TODO: Hoist from >2 successors. Currently GVNHoist will not hoist stores -// in this case because it works on two instructions at a time. 
-// entry: -// switch i32 %c1, label %exit1 [ -// i32 0, label %sw0 -// i32 1, label %sw1 -// ] -// -// sw0: -// store i32 1, i32* @G -// br label %exit -// -// sw1: -// store i32 1, i32* @G -// br label %exit -// -// exit1: -// store i32 1, i32* @G -// ret void -// exit: -// ret void //===----------------------------------------------------------------------===// #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/IteratedDominanceFrontier.h" +#include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" +#include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Utils/Local.h" +#include <algorithm> +#include <cassert> +#include <iterator> +#include <memory> +#include <utility> +#include <vector> using namespace llvm; @@ -69,6 +97,7 @@ static cl::opt<int> MaxHoistedThreshold("gvn-max-hoisted", cl::Hidden, cl::init(-1), cl::desc("Max number of instructions to hoist " "(default unlimited = -1)")); + static cl::opt<int> MaxNumberOfBBSInPath( "gvn-hoist-max-bbs", cl::Hidden, cl::init(4), cl::desc("Max number of basic blocks on the path between " @@ -86,34 +115,50 @@ static cl::opt<int> namespace llvm { -// Provides a sorting function based on the execution order of two instructions. -struct SortByDFSIn { -private: - DenseMap<const Value *, unsigned> &DFSNumber; +using BBSideEffectsSet = DenseMap<const BasicBlock *, bool>; +using SmallVecInsn = SmallVector<Instruction *, 4>; +using SmallVecImplInsn = SmallVectorImpl<Instruction *>; -public: - SortByDFSIn(DenseMap<const Value *, unsigned> &D) : DFSNumber(D) {} +// Each element of a hoisting list contains the basic block where to hoist and +// a list of instructions to be hoisted. +using HoistingPointInfo = std::pair<BasicBlock *, SmallVecInsn>; - // Returns true when A executes before B. - bool operator()(const Instruction *A, const Instruction *B) const { - const BasicBlock *BA = A->getParent(); - const BasicBlock *BB = B->getParent(); - unsigned ADFS, BDFS; - if (BA == BB) { - ADFS = DFSNumber.lookup(A); - BDFS = DFSNumber.lookup(B); - } else { - ADFS = DFSNumber.lookup(BA); - BDFS = DFSNumber.lookup(BB); - } - assert(ADFS && BDFS); - return ADFS < BDFS; - } -}; +using HoistingPointList = SmallVector<HoistingPointInfo, 4>; // A map from a pair of VNs to all the instructions with those VNs. 
-typedef DenseMap<std::pair<unsigned, unsigned>, SmallVector<Instruction *, 4>> - VNtoInsns; +using VNType = std::pair<unsigned, unsigned>; + +using VNtoInsns = DenseMap<VNType, SmallVector<Instruction *, 4>>; + +// CHI keeps information about values flowing out of a basic block. It is +// similar to PHI but in the inverse graph, and used for outgoing values on each +// edge. For conciseness, it is computed only for instructions with multiple +// occurrences in the CFG because they are the only hoistable candidates. +// A (CHI[{V, B, I1}, {V, C, I2}] +// / \ +// / \ +// B(I1) C (I2) +// The Value number for both I1 and I2 is V; the CHI node will save the +// instruction as well as the edge where the value is flowing to. +struct CHIArg { + VNType VN; + + // Edge destination (shows the direction of flow); may not be where I is. + BasicBlock *Dest; + + // The instruction (VN) which uses the values flowing out of CHI. + Instruction *I; + + bool operator==(const CHIArg &A) { return VN == A.VN; } + bool operator!=(const CHIArg &A) { return !(*this == A); } +}; + +using CHIIt = SmallVectorImpl<CHIArg>::iterator; +using CHIArgs = iterator_range<CHIIt>; +using OutValuesType = DenseMap<BasicBlock *, SmallVector<CHIArg, 2>>; +using InValuesType = + DenseMap<BasicBlock *, SmallVector<std::pair<VNType, Instruction *>, 2>>; + // An invalid value number used when inserting a single value number into // VNtoInsns. enum : unsigned { InvalidVN = ~2U }; @@ -192,16 +237,10 @@ public: } const VNtoInsns &getScalarVNTable() const { return VNtoCallsScalars; } - const VNtoInsns &getLoadVNTable() const { return VNtoCallsLoads; } - const VNtoInsns &getStoreVNTable() const { return VNtoCallsStores; } }; -typedef DenseMap<const BasicBlock *, bool> BBSideEffectsSet; -typedef SmallVector<Instruction *, 4> SmallVecInsn; -typedef SmallVectorImpl<Instruction *> SmallVecImplInsn; - static void combineKnownMetadata(Instruction *ReplInst, Instruction *I) { static const unsigned KnownIDs[] = { LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, @@ -216,15 +255,13 @@ static void combineKnownMetadata(Instruction *ReplInst, Instruction *I) { // cases reduce critical path (by exposing more ILP). class GVNHoist { public: - GVNHoist(DominatorTree *DT, AliasAnalysis *AA, MemoryDependenceResults *MD, - MemorySSA *MSSA) - : DT(DT), AA(AA), MD(MD), MSSA(MSSA), - MSSAUpdater(make_unique<MemorySSAUpdater>(MSSA)), - HoistingGeps(false), - HoistedCtr(0) - { } + GVNHoist(DominatorTree *DT, PostDominatorTree *PDT, AliasAnalysis *AA, + MemoryDependenceResults *MD, MemorySSA *MSSA) + : DT(DT), PDT(PDT), AA(AA), MD(MD), MSSA(MSSA), + MSSAUpdater(llvm::make_unique<MemorySSAUpdater>(MSSA)) {} bool run(Function &F) { + NumFuncArgs = F.arg_size(); VN.setDomTree(DT); VN.setAliasAnalysis(AA); VN.setMemDep(MD); @@ -241,7 +278,7 @@ public: int ChainLength = 0; // FIXME: use lazy evaluation of VN to avoid the fix-point computation. - while (1) { + while (true) { if (MaxChainLength != -1 && ++ChainLength >= MaxChainLength) return Res; @@ -261,18 +298,48 @@ public: return Res; } + // Copied from NewGVN.cpp + // This function provides global ranking of operations so that we can place + // them in a canonical order. Note that rank alone is not necessarily enough + // for a complete ordering, as constants all have the same rank. However, + // generally, we will simplify an operation with all constants so that it + // doesn't matter what order they appear in.
+ unsigned int rank(const Value *V) const { + // Prefer constants to undef to anything else + // Undef is a constant, have to check it first. + // Prefer smaller constants to constantexprs + if (isa<ConstantExpr>(V)) + return 2; + if (isa<UndefValue>(V)) + return 1; + if (isa<Constant>(V)) + return 0; + else if (auto *A = dyn_cast<Argument>(V)) + return 3 + A->getArgNo(); + + // Need to shift the instruction DFS by number of arguments + 3 to account + // for the constant and argument ranking above. + auto Result = DFSNumber.lookup(V); + if (Result > 0) + return 4 + NumFuncArgs + Result; + // Unreachable or something else, just return a really large number. + return ~0; + } + private: GVN::ValueTable VN; DominatorTree *DT; + PostDominatorTree *PDT; AliasAnalysis *AA; MemoryDependenceResults *MD; MemorySSA *MSSA; std::unique_ptr<MemorySSAUpdater> MSSAUpdater; - const bool HoistingGeps; DenseMap<const Value *, unsigned> DFSNumber; BBSideEffectsSet BBSideEffects; - DenseSet<const BasicBlock*> HoistBarrier; - int HoistedCtr; + DenseSet<const BasicBlock *> HoistBarrier; + SmallVector<BasicBlock *, 32> IDFBlocks; + unsigned NumFuncArgs; + const bool HoistingGeps = false; enum InsKind { Unknown, Scalar, Load, Store }; @@ -305,45 +372,7 @@ private: return false; } - // Return true when all paths from HoistBB to the end of the function pass - // through one of the blocks in WL. - bool hoistingFromAllPaths(const BasicBlock *HoistBB, - SmallPtrSetImpl<const BasicBlock *> &WL) { - - // Copy WL as the loop will remove elements from it. - SmallPtrSet<const BasicBlock *, 2> WorkList(WL.begin(), WL.end()); - - for (auto It = df_begin(HoistBB), E = df_end(HoistBB); It != E;) { - // There exists a path from HoistBB to the exit of the function if we are - // still iterating in DF traversal and we removed all instructions from - // the work list. - if (WorkList.empty()) - return false; - - const BasicBlock *BB = *It; - if (WorkList.erase(BB)) { - // Stop DFS traversal when BB is in the work list. - It.skipChildren(); - continue; - } - - // We reached the leaf Basic Block => not all paths have this instruction. - if (!BB->getTerminator()->getNumSuccessors()) - return false; - - // When reaching the back-edge of a loop, there may be a path through the - // loop that does not pass through B or C before exiting the loop. - if (successorDominate(BB, HoistBB)) - return false; - - // Increment DFS traversal when not skipping children. - ++It; - } - - return true; - } - - /* Return true when I1 appears before I2 in the instructions of BB. */ + // Return true when I1 appears before I2 in the instructions of BB. bool firstInBB(const Instruction *I1, const Instruction *I2) { assert(I1->getParent() == I2->getParent()); unsigned I1DFS = DFSNumber.lookup(I1); @@ -387,6 +416,25 @@ private: return false; } + bool hasEHhelper(const BasicBlock *BB, const BasicBlock *SrcBB, + int &NBBsOnAllPaths) { + // Stop walk once the limit is reached. + if (NBBsOnAllPaths == 0) + return true; + + // Impossible to hoist with exceptions on the path. + if (hasEH(BB)) + return true; + + // No such instruction after HoistBarrier in a basic block was + // selected for hoisting so instructions selected within basic block with + // a hoist barrier can be hoisted. + if ((BB != SrcBB) && HoistBarrier.count(BB)) + return true; + + return false; + } + // Return true when there are exception handling or loads of memory Def // between Def and NewPt. This function is only called for stores: Def is // the MemoryDef of the store to be hoisted. 
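To make the rank() tiers from earlier in this hunk concrete, here is a standalone toy with hypothetical inputs (only the tier arithmetic mirrors the pass; none of these names exist in LLVM):

#include <cassert>

// Toy analogue of rank(): plain constants sort first, then undef, then
// constant expressions, then arguments by position, then instructions in
// DFS order, shifted past all argument ranks.
enum class Kind { Constant, Undef, ConstExpr, Argument, Instruction };

static unsigned toyRank(Kind K, unsigned NumFuncArgs, unsigned Idx) {
  switch (K) {
  case Kind::Constant:    return 0;
  case Kind::Undef:       return 1;
  case Kind::ConstExpr:   return 2;
  case Kind::Argument:    return 3 + Idx;               // Idx = argument number
  case Kind::Instruction: return 4 + NumFuncArgs + Idx; // Idx = DFS number
  }
  return ~0U; // unreachable / unknown
}

int main() {
  // Constants sort before arguments, which sort before instructions.
  assert(toyRank(Kind::Constant, 2, 0) < toyRank(Kind::Argument, 2, 0));
  assert(toyRank(Kind::Argument, 2, 1) < toyRank(Kind::Instruction, 2, 1));
  return 0;
}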
@@ -414,18 +462,7 @@ private:
         continue;
       }
 
-      // Stop walk once the limit is reached.
-      if (NBBsOnAllPaths == 0)
-        return true;
-
-      // Impossible to hoist with exceptions on the path.
-      if (hasEH(BB))
-        return true;
-
-      // No such instruction after HoistBarrier in a basic block was
-      // selected for hoisting so instructions selected within basic block with
-      // a hoist barrier can be hoisted.
-      if ((BB != OldBB) && HoistBarrier.count(BB))
+      if (hasEHhelper(BB, OldBB, NBBsOnAllPaths))
         return true;
 
       // Check that we do not move a store past loads.
@@ -463,18 +500,7 @@ private:
         continue;
       }
 
-      // Stop walk once the limit is reached.
-      if (NBBsOnAllPaths == 0)
-        return true;
-
-      // Impossible to hoist with exceptions on the path.
-      if (hasEH(BB))
-        return true;
-
-      // No such instruction after HoistBarrier in a basic block was
-      // selected for hoisting so instructions selected within basic block with
-      // a hoist barrier can be hoisted.
-      if ((BB != SrcBB) && HoistBarrier.count(BB))
+      if (hasEHhelper(BB, SrcBB, NBBsOnAllPaths))
        return true;
 
       // -1 is unlimited number of blocks on all paths.
@@ -491,7 +517,6 @@ private:
   // to NewPt.
   bool safeToHoistLdSt(const Instruction *NewPt, const Instruction *OldPt,
                        MemoryUseOrDef *U, InsKind K, int &NBBsOnAllPaths) {
-
     // In place hoisting is safe.
     if (NewPt == OldPt)
       return true;
@@ -533,141 +558,258 @@ private:
 
   // Return true when it is safe to hoist scalar instructions from all blocks in
   // WL to HoistBB.
-  bool safeToHoistScalar(const BasicBlock *HoistBB,
-                         SmallPtrSetImpl<const BasicBlock *> &WL,
+  bool safeToHoistScalar(const BasicBlock *HoistBB, const BasicBlock *BB,
                          int &NBBsOnAllPaths) {
-    // Check that the hoisted expression is needed on all paths.
-    if (!hoistingFromAllPaths(HoistBB, WL))
-      return false;
+    return !hasEHOnPath(HoistBB, BB, NBBsOnAllPaths);
+  }
 
-    for (const BasicBlock *BB : WL)
-      if (hasEHOnPath(HoistBB, BB, NBBsOnAllPaths))
-        return false;
+  // In the inverse CFG, the dominance frontier of basic block (BB) is the
+  // point where ANTIC needs to be computed for instructions which are going
+  // to be hoisted. Since this point does not change during gvn-hoist,
+  // we compute it only once (on demand).
+  // The idea is inspired by:
+  // "Partial Redundancy Elimination in SSA Form"
+  // ROBERT KENNEDY, SUN CHAN, SHIN-MING LIU, RAYMOND LO, PENG TU and FRED CHOW
+  // They use a similar idea in the forward graph to find fully redundant and
+  // partially redundant expressions; here it is used in the inverse graph to
+  // find fully anticipable instructions at a merge point (post-dominator in
+  // the inverse CFG).
+  // Returns the edge via which an instruction in BB will get its values from.
+
+  // Returns true when the values are flowing out to each edge.
+  bool valueAnticipable(CHIArgs C, TerminatorInst *TI) const {
+    if (TI->getNumSuccessors() > (unsigned)std::distance(C.begin(), C.end()))
+      return false; // Not enough args in this CHI.
 
+    for (auto CHI : C) {
+      BasicBlock *Dest = CHI.Dest;
+      // Find if all the edges have values flowing out of BB.
+      bool Found = llvm::any_of(TI->successors(), [Dest](const BasicBlock *BB) {
+                     return BB == Dest; });
+      if (!Found)
+        return false;
+    }
     return true;
   }
 
-  // Each element of a hoisting list contains the basic block where to hoist and
-  // a list of instructions to be hoisted.
- typedef std::pair<BasicBlock *, SmallVecInsn> HoistingPointInfo; - typedef SmallVector<HoistingPointInfo, 4> HoistingPointList; + // Check if it is safe to hoist values tracked by CHI in the range + // [Begin, End) and accumulate them in Safe. + void checkSafety(CHIArgs C, BasicBlock *BB, InsKind K, + SmallVectorImpl<CHIArg> &Safe) { + int NumBBsOnAllPaths = MaxNumberOfBBSInPath; + for (auto CHI : C) { + Instruction *Insn = CHI.I; + if (!Insn) // No instruction was inserted in this CHI. + continue; + if (K == InsKind::Scalar) { + if (safeToHoistScalar(BB, Insn->getParent(), NumBBsOnAllPaths)) + Safe.push_back(CHI); + } else { + MemoryUseOrDef *UD = MSSA->getMemoryAccess(Insn); + if (safeToHoistLdSt(BB->getTerminator(), Insn, UD, K, NumBBsOnAllPaths)) + Safe.push_back(CHI); + } + } + } + + using RenameStackType = DenseMap<VNType, SmallVector<Instruction *, 2>>; - // Partition InstructionsToHoist into a set of candidates which can share a - // common hoisting point. The partitions are collected in HPL. IsScalar is - // true when the instructions in InstructionsToHoist are scalars. IsLoad is - // true when the InstructionsToHoist are loads, false when they are stores. - void partitionCandidates(SmallVecImplInsn &InstructionsToHoist, - HoistingPointList &HPL, InsKind K) { - // No need to sort for two instructions. - if (InstructionsToHoist.size() > 2) { - SortByDFSIn Pred(DFSNumber); - std::sort(InstructionsToHoist.begin(), InstructionsToHoist.end(), Pred); + // Push all the VNs corresponding to BB into RenameStack. + void fillRenameStack(BasicBlock *BB, InValuesType &ValueBBs, + RenameStackType &RenameStack) { + auto it1 = ValueBBs.find(BB); + if (it1 != ValueBBs.end()) { + // Iterate in reverse order to keep lower ranked values on the top. + for (std::pair<VNType, Instruction *> &VI : reverse(it1->second)) { + // Get the value of instruction I + DEBUG(dbgs() << "\nPushing on stack: " << *VI.second); + RenameStack[VI.first].push_back(VI.second); + } } + } - int NumBBsOnAllPaths = MaxNumberOfBBSInPath; + void fillChiArgs(BasicBlock *BB, OutValuesType &CHIBBs, + RenameStackType &RenameStack) { + // For each *predecessor* (because Post-DOM) of BB check if it has a CHI + for (auto Pred : predecessors(BB)) { + auto P = CHIBBs.find(Pred); + if (P == CHIBBs.end()) { + continue; + } + DEBUG(dbgs() << "\nLooking at CHIs in: " << Pred->getName();); + // A CHI is found (BB -> Pred is an edge in the CFG) + // Pop the stack until Top(V) = Ve. + auto &VCHI = P->second; + for (auto It = VCHI.begin(), E = VCHI.end(); It != E;) { + CHIArg &C = *It; + if (!C.Dest) { + auto si = RenameStack.find(C.VN); + // The Basic Block where CHI is must dominate the value we want to + // track in a CHI. In the PDom walk, there can be values in the + // stack which are not control dependent e.g., nested loop. 
+          if (si != RenameStack.end() && si->second.size() &&
+              DT->dominates(Pred, si->second.back()->getParent())) {
+            C.Dest = BB;                     // Assign the edge
+            C.I = si->second.pop_back_val(); // Assign the argument
+            DEBUG(dbgs() << "\nCHI Inserted in BB: " << C.Dest->getName()
+                         << *C.I << ", VN: " << C.VN.first << ", "
+                         << C.VN.second);
+          }
+          // Move to next CHI of a different value
+          It = std::find_if(It, VCHI.end(),
+                            [It](CHIArg &A) { return A != *It; });
+        } else
+          ++It;
+      }
+    }
+  }
 
-    SmallVecImplInsn::iterator II = InstructionsToHoist.begin();
-    SmallVecImplInsn::iterator Start = II;
-    Instruction *HoistPt = *II;
-    BasicBlock *HoistBB = HoistPt->getParent();
-    MemoryUseOrDef *UD;
-    if (K != InsKind::Scalar)
-      UD = MSSA->getMemoryAccess(HoistPt);
+  // Walk the post-dominator tree top-down and use a stack for each value to
+  // store the last value you see. When you hit a CHI from a given edge, the
+  // value to use as the argument is at the top of the stack; add the value to
+  // the CHI and pop.
+  void insertCHI(InValuesType &ValueBBs, OutValuesType &CHIBBs) {
+    auto Root = PDT->getNode(nullptr);
+    if (!Root)
+      return;
+    // Depth first walk on PDom tree to fill the CHIargs at each PDF.
+    RenameStackType RenameStack;
+    for (auto Node : depth_first(Root)) {
+      BasicBlock *BB = Node->getBlock();
+      if (!BB)
+        continue;
 
-    for (++II; II != InstructionsToHoist.end(); ++II) {
-      Instruction *Insn = *II;
-      BasicBlock *BB = Insn->getParent();
-      BasicBlock *NewHoistBB;
-      Instruction *NewHoistPt;
+      // Collect all values in BB and push to stack.
+      fillRenameStack(BB, ValueBBs, RenameStack);
 
-      if (BB == HoistBB) { // Both are in the same Basic Block.
-        NewHoistBB = HoistBB;
-        NewHoistPt = firstInBB(Insn, HoistPt) ? Insn : HoistPt;
-      } else {
-        // If the hoisting point contains one of the instructions,
-        // then hoist there, otherwise hoist before the terminator.
-        NewHoistBB = DT->findNearestCommonDominator(HoistBB, BB);
-        if (NewHoistBB == BB)
-          NewHoistPt = Insn;
-        else if (NewHoistBB == HoistBB)
-          NewHoistPt = HoistPt;
-        else
-          NewHoistPt = NewHoistBB->getTerminator();
-      }
+      // Fill outgoing values in each CHI corresponding to BB.
+      fillChiArgs(BB, CHIBBs, RenameStack);
+    }
+  }
 
-      SmallPtrSet<const BasicBlock *, 2> WL;
-      WL.insert(HoistBB);
-      WL.insert(BB);
+  // Walk all the CHI-nodes to find ones which have an empty entry and remove
+  // them. Then collect all the instructions which are safe to hoist and see if
+  // they form a list of anticipable values. OutValues contains CHIs
+  // corresponding to each basic block.
+  void findHoistableCandidates(OutValuesType &CHIBBs, InsKind K,
+                               HoistingPointList &HPL) {
+    auto cmpVN = [](const CHIArg &A, const CHIArg &B) { return A.VN < B.VN; };
 
-      if (K == InsKind::Scalar) {
-        if (safeToHoistScalar(NewHoistBB, WL, NumBBsOnAllPaths)) {
-          // Extend HoistPt to NewHoistPt.
-          HoistPt = NewHoistPt;
-          HoistBB = NewHoistBB;
-          continue;
-        }
-      } else {
-        // When NewBB already contains an instruction to be hoisted, the
-        // expression is needed on all paths.
-        // Check that the hoisted expression is needed on all paths: it is
-        // unsafe to hoist loads to a place where there may be a path not
-        // loading from the same address: for instance there may be a branch on
-        // which the address of the load may not be initialized.
-        if ((HoistBB == NewHoistBB || BB == NewHoistBB ||
-             hoistingFromAllPaths(NewHoistBB, WL)) &&
-            // Also check that it is safe to move the load or store from HoistPt
-            // to NewHoistPt.
-            safeToHoistLdSt(NewHoistPt, HoistPt, UD, K, NumBBsOnAllPaths) &&
-            safeToHoistLdSt(NewHoistPt, Insn, MSSA->getMemoryAccess(Insn),
-                            K, NumBBsOnAllPaths)) {
-          // Extend HoistPt to NewHoistPt.
-          HoistPt = NewHoistPt;
-          HoistBB = NewHoistBB;
-          continue;
-        }
-      }
+    // CHIArgs now have the outgoing values, so check for anticipability and
+    // accumulate hoistable candidates in HPL.
+    for (std::pair<BasicBlock *, SmallVector<CHIArg, 2>> &A : CHIBBs) {
+      BasicBlock *BB = A.first;
+      SmallVectorImpl<CHIArg> &CHIs = A.second;
+      // The vector of CHIs contains CHIs for different instructions.
+      // Sort the args according to their VNs, such that identical
+      // instructions are together.
+      std::stable_sort(CHIs.begin(), CHIs.end(), cmpVN);
+      auto TI = BB->getTerminator();
+      auto B = CHIs.begin();
+      // [PrevIt, PHIIt) form a range of CHIs which have identical VNs.
+      auto PHIIt = std::find_if(CHIs.begin(), CHIs.end(),
+                                [B](CHIArg &A) { return A != *B; });
+      auto PrevIt = CHIs.begin();
+      while (PrevIt != PHIIt) {
+        // Collect values which satisfy safety checks.
+        SmallVector<CHIArg, 2> Safe;
+        // We check for safety first because there might be multiple values in
+        // the same path, some of which are not safe to be hoisted, but overall
+        // each edge has at least one value which can be hoisted, making the
+        // value anticipable along that path.
+        checkSafety(make_range(PrevIt, PHIIt), BB, K, Safe);

-        // At this point it is not safe to extend the current hoisting to
-        // NewHoistPt: save the hoisting list so far.
-        if (std::distance(Start, II) > 1)
-          HPL.push_back({HoistBB, SmallVecInsn(Start, II)});
+        // The list of safe values should be anticipable at TI.
+        if (valueAnticipable(make_range(Safe.begin(), Safe.end()), TI)) {
+          HPL.push_back({BB, SmallVecInsn()});
+          SmallVecInsn &V = HPL.back().second;
+          for (auto B : Safe)
+            V.push_back(B.I);
+        }

-        // Start over from BB.
-        Start = II;
-        if (K != InsKind::Scalar)
-          UD = MSSA->getMemoryAccess(*Start);
-        HoistPt = Insn;
-        HoistBB = BB;
-        NumBBsOnAllPaths = MaxNumberOfBBSInPath;
+        // Check other VNs
+        PrevIt = PHIIt;
+        PHIIt = std::find_if(PrevIt, CHIs.end(),
+                             [PrevIt](CHIArg &A) { return A != *PrevIt; });
+      }
     }
-
-    // Save the last partition.
-    if (std::distance(Start, II) > 1)
-      HPL.push_back({HoistBB, SmallVecInsn(Start, II)});
   }

-  // Initialize HPL from Map.
+  // Compute insertion points for each value which can be fully anticipated at
+  // a dominator. HPL contains all such values.
   void computeInsertionPoints(const VNtoInsns &Map, HoistingPointList &HPL,
                               InsKind K) {
+    // Sort VNs based on their rankings.
+    std::vector<VNType> Ranks;
     for (const auto &Entry : Map) {
-      if (MaxHoistedThreshold != -1 && ++HoistedCtr > MaxHoistedThreshold)
-        return;
+      Ranks.push_back(Entry.first);
+    }
+
+    // TODO: Remove fully-redundant expressions.
+    // Get instruction from the Map, assume that all the Instructions
+    // with same VNs have same rank (this is an approximation).
+    std::sort(Ranks.begin(), Ranks.end(),
+              [this, &Map](const VNType &r1, const VNType &r2) {
+                return (rank(*Map.lookup(r1).begin()) <
+                        rank(*Map.lookup(r2).begin()));
+              });

-      const SmallVecInsn &V = Entry.second;
+    // - Sort VNs according to their rank, and start with the lowest ranked VN
+    // - Take a VN and for each instruction with same VN
+    //   - Find the dominance frontier in the inverse graph (PDF)
+    //   - Insert the chi-node at PDF
+    //   - Remove the chi-nodes with missing entries
+    //   - Remove values from CHI-nodes which do not truly flow out, e.g.,
+    //     modified along the path.
+    // - Collect the remaining values that are still anticipable
+    SmallVector<BasicBlock *, 2> IDFBlocks;
+    ReverseIDFCalculator IDFs(*PDT);
+    OutValuesType OutValue;
+    InValuesType InValue;
+    for (const auto &R : Ranks) {
+      const SmallVecInsn &V = Map.lookup(R);
       if (V.size() < 2)
         continue;
+      const VNType &VN = R;
+      SmallPtrSet<BasicBlock *, 2> VNBlocks;
+      for (auto &I : V) {
+        BasicBlock *BBI = I->getParent();
+        if (!hasEH(BBI))
+          VNBlocks.insert(BBI);
+      }
+      // Compute the Post Dominance Frontiers of each basic block.
+      // The dominance frontier of a block X in the reverse control
+      // graph is the set of blocks upon which X is control dependent.
+      // The following computes the blocks where CHI nodes need to be
+      // inserted for the occurrences of this value number.
+      IDFs.setDefiningBlocks(VNBlocks);
+      IDFs.calculate(IDFBlocks);

-      // Compute the insertion point and the list of expressions to be hoisted.
-      SmallVecInsn InstructionsToHoist;
-      for (auto I : V)
-        // We don't need to check for hoist-barriers here because if
-        // I->getParent() is a barrier then I precedes the barrier.
-        if (!hasEH(I->getParent()))
-          InstructionsToHoist.push_back(I);
-
-      if (!InstructionsToHoist.empty())
-        partitionCandidates(InstructionsToHoist, HPL, K);
+      // Make a map of BB vs instructions to be hoisted.
+      for (unsigned i = 0; i < V.size(); ++i) {
+        InValue[V[i]->getParent()].push_back(std::make_pair(VN, V[i]));
+      }
+      // Insert an empty CHI node for this VN. This is used to factor out
+      // basic blocks where the ANTIC can potentially change.
+      for (auto IDFB : IDFBlocks) { // TODO: Prune out useless CHI insertions.
+        for (unsigned i = 0; i < V.size(); ++i) {
+          CHIArg C = {VN, nullptr, nullptr};
+          // Ignore spurious PDFs.
+          if (DT->properlyDominates(IDFB, V[i]->getParent())) {
+            OutValue[IDFB].push_back(C);
+            DEBUG(dbgs() << "\nInserting a CHI for BB: " << IDFB->getName()
+                         << ", for Insn: " << *V[i]);
+          }
+        }
+      }
     }
+
+    // Insert CHI args at each PDF to iterate on factored graph of
+    // control dependence.
+    insertCHI(InValue, OutValue);
+    // Using the CHI args inserted at each PDF, find fully anticipable values.
+    findHoistableCandidates(OutValue, K, HPL);
   }

   // Return true when all operands of Instr are available at insertion point
@@ -714,7 +856,6 @@ private:
     Instruction *ClonedGep = Gep->clone();
     for (unsigned i = 0, e = Gep->getNumOperands(); i != e; ++i)
       if (Instruction *Op = dyn_cast<Instruction>(Gep->getOperand(i))) {
-
         // Check whether the operand is already available.
         if (DT->dominates(Op->getParent(), HoistPt))
           continue;
@@ -748,6 +889,88 @@
     Repl->replaceUsesOfWith(Gep, ClonedGep);
   }

+  void updateAlignment(Instruction *I, Instruction *Repl) {
+    if (auto *ReplacementLoad = dyn_cast<LoadInst>(Repl)) {
+      ReplacementLoad->setAlignment(
+          std::min(ReplacementLoad->getAlignment(),
+                   cast<LoadInst>(I)->getAlignment()));
+      ++NumLoadsRemoved;
+    } else if (auto *ReplacementStore = dyn_cast<StoreInst>(Repl)) {
+      ReplacementStore->setAlignment(
+          std::min(ReplacementStore->getAlignment(),
+                   cast<StoreInst>(I)->getAlignment()));
+      ++NumStoresRemoved;
+    } else if (auto *ReplacementAlloca = dyn_cast<AllocaInst>(Repl)) {
+      ReplacementAlloca->setAlignment(
+          std::max(ReplacementAlloca->getAlignment(),
+                   cast<AllocaInst>(I)->getAlignment()));
+    } else if (isa<CallInst>(Repl)) {
+      ++NumCallsRemoved;
+    }
+  }
+
+  // Remove all the instructions in Candidates and replace their usage with Repl.
+  // Returns the number of instructions removed.
+ unsigned rauw(const SmallVecInsn &Candidates, Instruction *Repl, + MemoryUseOrDef *NewMemAcc) { + unsigned NR = 0; + for (Instruction *I : Candidates) { + if (I != Repl) { + ++NR; + updateAlignment(I, Repl); + if (NewMemAcc) { + // Update the uses of the old MSSA access with NewMemAcc. + MemoryAccess *OldMA = MSSA->getMemoryAccess(I); + OldMA->replaceAllUsesWith(NewMemAcc); + MSSAUpdater->removeMemoryAccess(OldMA); + } + + Repl->andIRFlags(I); + combineKnownMetadata(Repl, I); + I->replaceAllUsesWith(Repl); + // Also invalidate the Alias Analysis cache. + MD->removeInstruction(I); + I->eraseFromParent(); + } + } + return NR; + } + + // Replace all Memory PHI usage with NewMemAcc. + void raMPHIuw(MemoryUseOrDef *NewMemAcc) { + SmallPtrSet<MemoryPhi *, 4> UsePhis; + for (User *U : NewMemAcc->users()) + if (MemoryPhi *Phi = dyn_cast<MemoryPhi>(U)) + UsePhis.insert(Phi); + + for (MemoryPhi *Phi : UsePhis) { + auto In = Phi->incoming_values(); + if (llvm::all_of(In, [&](Use &U) { return U == NewMemAcc; })) { + Phi->replaceAllUsesWith(NewMemAcc); + MSSAUpdater->removeMemoryAccess(Phi); + } + } + } + + // Remove all other instructions and replace them with Repl. + unsigned removeAndReplace(const SmallVecInsn &Candidates, Instruction *Repl, + BasicBlock *DestBB, bool MoveAccess) { + MemoryUseOrDef *NewMemAcc = MSSA->getMemoryAccess(Repl); + if (MoveAccess && NewMemAcc) { + // The definition of this ld/st will not change: ld/st hoisting is + // legal when the ld/st is not moved past its current definition. + MSSAUpdater->moveToPlace(NewMemAcc, DestBB, MemorySSA::End); + } + + // Replace all other instructions with Repl with memory access NewMemAcc. + unsigned NR = rauw(Candidates, Repl, NewMemAcc); + + // Remove MemorySSA phi nodes with the same arguments. + if (NewMemAcc) + raMPHIuw(NewMemAcc); + return NR; + } + // In the case Repl is a load or a store, we make all their GEPs // available: GEPs are not hoisted by default to avoid the address // computations to be hoisted without the associated load or store. @@ -789,11 +1012,11 @@ private: for (const HoistingPointInfo &HP : HPL) { // Find out whether we already have one of the instructions in HoistPt, // in which case we do not have to move it. - BasicBlock *HoistPt = HP.first; + BasicBlock *DestBB = HP.first; const SmallVecInsn &InstructionsToHoist = HP.second; Instruction *Repl = nullptr; for (Instruction *I : InstructionsToHoist) - if (I->getParent() == HoistPt) + if (I->getParent() == DestBB) // If there are two instructions in HoistPt to be hoisted in place: // update Repl to be the first one, such that we can rename the uses // of the second based on the first. @@ -805,7 +1028,7 @@ private: bool MoveAccess = true; if (Repl) { // Repl is already in HoistPt: it remains in place. - assert(allOperandsAvailable(Repl, HoistPt) && + assert(allOperandsAvailable(Repl, DestBB) && "instruction depends on operands that are not available"); MoveAccess = false; } else { @@ -816,40 +1039,26 @@ private: // We can move Repl in HoistPt only when all operands are available. // The order in which hoistings are done may influence the availability // of operands. - if (!allOperandsAvailable(Repl, HoistPt)) { - + if (!allOperandsAvailable(Repl, DestBB)) { // When HoistingGeps there is nothing more we can do to make the // operands available: just continue. if (HoistingGeps) continue; // When not HoistingGeps we need to copy the GEPs. 
- if (!makeGepOperandsAvailable(Repl, HoistPt, InstructionsToHoist)) + if (!makeGepOperandsAvailable(Repl, DestBB, InstructionsToHoist)) continue; } // Move the instruction at the end of HoistPt. - Instruction *Last = HoistPt->getTerminator(); + Instruction *Last = DestBB->getTerminator(); MD->removeInstruction(Repl); Repl->moveBefore(Last); DFSNumber[Repl] = DFSNumber[Last]++; } - MemoryAccess *NewMemAcc = MSSA->getMemoryAccess(Repl); - - if (MoveAccess) { - if (MemoryUseOrDef *OldMemAcc = - dyn_cast_or_null<MemoryUseOrDef>(NewMemAcc)) { - // The definition of this ld/st will not change: ld/st hoisting is - // legal when the ld/st is not moved past its current definition. - MemoryAccess *Def = OldMemAcc->getDefiningAccess(); - NewMemAcc = - MSSAUpdater->createMemoryAccessInBB(Repl, Def, HoistPt, MemorySSA::End); - OldMemAcc->replaceAllUsesWith(NewMemAcc); - MSSAUpdater->removeMemoryAccess(OldMemAcc); - } - } + NR += removeAndReplace(InstructionsToHoist, Repl, DestBB, MoveAccess); if (isa<LoadInst>(Repl)) ++NL; @@ -859,59 +1068,6 @@ private: ++NC; else // Scalar ++NI; - - // Remove and rename all other instructions. - for (Instruction *I : InstructionsToHoist) - if (I != Repl) { - ++NR; - if (auto *ReplacementLoad = dyn_cast<LoadInst>(Repl)) { - ReplacementLoad->setAlignment( - std::min(ReplacementLoad->getAlignment(), - cast<LoadInst>(I)->getAlignment())); - ++NumLoadsRemoved; - } else if (auto *ReplacementStore = dyn_cast<StoreInst>(Repl)) { - ReplacementStore->setAlignment( - std::min(ReplacementStore->getAlignment(), - cast<StoreInst>(I)->getAlignment())); - ++NumStoresRemoved; - } else if (auto *ReplacementAlloca = dyn_cast<AllocaInst>(Repl)) { - ReplacementAlloca->setAlignment( - std::max(ReplacementAlloca->getAlignment(), - cast<AllocaInst>(I)->getAlignment())); - } else if (isa<CallInst>(Repl)) { - ++NumCallsRemoved; - } - - if (NewMemAcc) { - // Update the uses of the old MSSA access with NewMemAcc. - MemoryAccess *OldMA = MSSA->getMemoryAccess(I); - OldMA->replaceAllUsesWith(NewMemAcc); - MSSAUpdater->removeMemoryAccess(OldMA); - } - - Repl->andIRFlags(I); - combineKnownMetadata(Repl, I); - I->replaceAllUsesWith(Repl); - // Also invalidate the Alias Analysis cache. - MD->removeInstruction(I); - I->eraseFromParent(); - } - - // Remove MemorySSA phi nodes with the same arguments. - if (NewMemAcc) { - SmallPtrSet<MemoryPhi *, 4> UsePhis; - for (User *U : NewMemAcc->users()) - if (MemoryPhi *Phi = dyn_cast<MemoryPhi>(U)) - UsePhis.insert(Phi); - - for (auto *Phi : UsePhis) { - auto In = Phi->incoming_values(); - if (all_of(In, [&](Use &U) { return U == NewMemAcc; })) { - Phi->replaceAllUsesWith(NewMemAcc); - MSSAUpdater->removeMemoryAccess(Phi); - } - } - } } NumHoisted += NL + NS + NC + NI; @@ -935,8 +1091,8 @@ private: // If I1 cannot guarantee progress, subsequent instructions // in BB cannot be hoisted anyways. if (!isGuaranteedToTransferExecutionToSuccessor(&I1)) { - HoistBarrier.insert(BB); - break; + HoistBarrier.insert(BB); + break; } // Only hoist the first instructions in BB up to MaxDepthInBB. Hoisting // deeper may increase the register pressure and compilation time. 
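The two rules enforced by the barrier scan above can be seen in a small source-level example (hypothetical input, not part of this commit): identical expressions below a branch share a value number and can be hoisted into the common dominator, but once a block contains an instruction that is not guaranteed to transfer execution to its successor, the scan breaks and nothing after that point in the block becomes a candidate.

#include <cstdlib>

int g;

// Stands in for any call that is not guaranteed to transfer execution
// to its successor (it may exit), making its block a hoist barrier.
void may_exit(int x) {
  if (x < 0)
    std::exit(1);
}

int f(int a, int b, bool c) {
  int r;
  if (c) {
    r = a * b;   // same value number as the multiply below
  } else {
    may_exit(g); // barrier: candidate collection stops here for this block
    r = a * b;   // below the barrier, so never collected as a candidate
  }
  return r + g;  // without the barrier, a * b could be hoisted above the if
}

int main() { return f(2, 3, true) == 6 ? 0 : 1; }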
@@ -954,7 +1110,8 @@ private: else if (auto *Call = dyn_cast<CallInst>(&I1)) { if (auto *Intr = dyn_cast<IntrinsicInst>(Call)) { if (isa<DbgInfoIntrinsic>(Intr) || - Intr->getIntrinsicID() == Intrinsic::assume) + Intr->getIntrinsicID() == Intrinsic::assume || + Intr->getIntrinsicID() == Intrinsic::sideeffect) continue; } if (Call->mayHaveSideEffects()) @@ -996,16 +1153,18 @@ public: if (skipFunction(F)) return false; auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); auto &MD = getAnalysis<MemoryDependenceWrapperPass>().getMemDep(); auto &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA(); - GVNHoist G(&DT, &AA, &MD, &MSSA); + GVNHoist G(&DT, &PDT, &AA, &MD, &MSSA); return G.run(F); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<PostDominatorTreeWrapperPass>(); AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<MemoryDependenceWrapperPass>(); AU.addRequired<MemorySSAWrapperPass>(); @@ -1014,14 +1173,16 @@ public: AU.addPreserved<GlobalsAAWrapperPass>(); } }; -} // namespace + +} // end namespace llvm PreservedAnalyses GVNHoistPass::run(Function &F, FunctionAnalysisManager &AM) { DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); + PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F); AliasAnalysis &AA = AM.getResult<AAManager>(F); MemoryDependenceResults &MD = AM.getResult<MemoryDependenceAnalysis>(F); MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA(); - GVNHoist G(&DT, &AA, &MD, &MSSA); + GVNHoist G(&DT, &PDT, &AA, &MD, &MSSA); if (!G.run(F)) return PreservedAnalyses::all(); @@ -1033,6 +1194,7 @@ PreservedAnalyses GVNHoistPass::run(Function &F, FunctionAnalysisManager &AM) { } char GVNHoistLegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(GVNHoistLegacyPass, "gvn-hoist", "Early GVN Hoisting of Expressions", false, false) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) diff --git a/lib/Transforms/Scalar/GVNSink.cpp b/lib/Transforms/Scalar/GVNSink.cpp index 5fd2dfc118b4..814a62cd7d65 100644 --- a/lib/Transforms/Scalar/GVNSink.cpp +++ b/lib/Transforms/Scalar/GVNSink.cpp @@ -1,4 +1,4 @@ -//===- GVNSink.cpp - sink expressions into successors -------------------===// +//===- GVNSink.cpp - sink expressions into successors ---------------------===// // // The LLVM Compiler Infrastructure // @@ -31,33 +31,54 @@ /// replace %a1 with %c1, will it contribute in an equivalent way to all /// successive instructions?". The PostValueTable class in GVN provides this /// mapping. 
-/// +// //===----------------------------------------------------------------------===// +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Hashing.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/MemorySSA.h" -#include "llvm/Analysis/PostDominators.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Verifier.h" -#include "llvm/Support/MathExtras.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/ArrayRecycler.h" +#include "llvm/Support/AtomicOrdering.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/GVNExpression.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" -#include <unordered_set> +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <iterator> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "gvn-sink" @@ -72,8 +93,8 @@ LLVM_DUMP_METHOD void Expression::dump() const { dbgs() << "\n"; } -} -} +} // end namespace GVNExpression +} // end namespace llvm namespace { @@ -97,7 +118,7 @@ static bool isMemoryInst(const Instruction *I) { /// list returned by operator*. class LockstepReverseIterator { ArrayRef<BasicBlock *> Blocks; - SmallPtrSet<BasicBlock *, 4> ActiveBlocks; + SmallSetVector<BasicBlock *, 4> ActiveBlocks; SmallVector<Instruction *, 4> Insts; bool Fail; @@ -115,7 +136,7 @@ public: for (BasicBlock *BB : Blocks) { if (BB->size() <= 1) { // Block wasn't big enough - only contained a terminator. - ActiveBlocks.erase(BB); + ActiveBlocks.remove(BB); continue; } Insts.push_back(BB->getTerminator()->getPrevNode()); @@ -126,13 +147,20 @@ public: bool isValid() const { return !Fail; } ArrayRef<Instruction *> operator*() const { return Insts; } - SmallPtrSet<BasicBlock *, 4> &getActiveBlocks() { return ActiveBlocks; } - void restrictToBlocks(SmallPtrSetImpl<BasicBlock *> &Blocks) { + // Note: This needs to return a SmallSetVector as the elements of + // ActiveBlocks will be later copied to Blocks using std::copy. The + // resultant order of elements in Blocks needs to be deterministic. + // Using SmallPtrSet instead causes non-deterministic order while + // copying. And we cannot simply sort Blocks as they need to match the + // corresponding Values. 
+ SmallSetVector<BasicBlock *, 4> &getActiveBlocks() { return ActiveBlocks; } + + void restrictToBlocks(SmallSetVector<BasicBlock *, 4> &Blocks) { for (auto II = Insts.begin(); II != Insts.end();) { if (std::find(Blocks.begin(), Blocks.end(), (*II)->getParent()) == Blocks.end()) { - ActiveBlocks.erase((*II)->getParent()); + ActiveBlocks.remove((*II)->getParent()); II = Insts.erase(II); } else { ++II; @@ -146,7 +174,7 @@ public: SmallVector<Instruction *, 4> NewInsts; for (auto *Inst : Insts) { if (Inst == &Inst->getParent()->front()) - ActiveBlocks.erase(Inst->getParent()); + ActiveBlocks.remove(Inst->getParent()); else NewInsts.push_back(Inst->getPrevNode()); } @@ -180,14 +208,14 @@ struct SinkingInstructionCandidate { NumExtraPHIs) // PHIs are expensive, so make sure they're worth it. - SplitEdgeCost; } + bool operator>(const SinkingInstructionCandidate &Other) const { return Cost > Other.Cost; } }; #ifndef NDEBUG -llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, - const SinkingInstructionCandidate &C) { +raw_ostream &operator<<(raw_ostream &OS, const SinkingInstructionCandidate &C) { OS << "<Candidate Cost=" << C.Cost << " #Blocks=" << C.NumBlocks << " #Insts=" << C.NumInstructions << " #PHIs=" << C.NumPHIs << ">"; return OS; @@ -204,17 +232,20 @@ class ModelledPHI { SmallVector<BasicBlock *, 4> Blocks; public: - ModelledPHI() {} + ModelledPHI() = default; + ModelledPHI(const PHINode *PN) { + // BasicBlock comes first so we sort by basic block pointer order, then by value pointer order. + SmallVector<std::pair<BasicBlock *, Value *>, 4> Ops; for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) - Blocks.push_back(PN->getIncomingBlock(I)); - std::sort(Blocks.begin(), Blocks.end()); - - // This assumes the PHI is already well-formed and there aren't conflicting - // incoming values for the same block. - for (auto *B : Blocks) - Values.push_back(PN->getIncomingValueForBlock(B)); + Ops.push_back({PN->getIncomingBlock(I), PN->getIncomingValue(I)}); + std::sort(Ops.begin(), Ops.end()); + for (auto &P : Ops) { + Blocks.push_back(P.first); + Values.push_back(P.second); + } } + /// Create a dummy ModelledPHI that will compare unequal to any other ModelledPHI /// without the same ID. /// \note This is specifically for DenseMapInfo - do not use this! @@ -241,7 +272,7 @@ public: /// Restrict the PHI's contents down to only \c NewBlocks. /// \c NewBlocks must be a subset of \c this->Blocks. 
- void restrictToBlocks(const SmallPtrSetImpl<BasicBlock *> &NewBlocks) { + void restrictToBlocks(const SmallSetVector<BasicBlock *, 4> &NewBlocks) { auto BI = Blocks.begin(); auto VI = Values.begin(); while (BI != Blocks.end()) { @@ -261,19 +292,23 @@ public: ArrayRef<Value *> getValues() const { return Values; } bool areAllIncomingValuesSame() const { - return all_of(Values, [&](Value *V) { return V == Values[0]; }); + return llvm::all_of(Values, [&](Value *V) { return V == Values[0]; }); } + bool areAllIncomingValuesSameType() const { - return all_of( + return llvm::all_of( Values, [&](Value *V) { return V->getType() == Values[0]->getType(); }); } + bool areAnyIncomingValuesConstant() const { - return any_of(Values, [&](Value *V) { return isa<Constant>(V); }); + return llvm::any_of(Values, [&](Value *V) { return isa<Constant>(V); }); } + // Hash functor unsigned hash() const { return (unsigned)hash_combine_range(Values.begin(), Values.end()); } + bool operator==(const ModelledPHI &Other) const { return Values == Other.Values && Blocks == Other.Blocks; } @@ -284,17 +319,20 @@ template <typename ModelledPHI> struct DenseMapInfo { static ModelledPHI Dummy = ModelledPHI::createDummy(0); return Dummy; } + static inline ModelledPHI &getTombstoneKey() { static ModelledPHI Dummy = ModelledPHI::createDummy(1); return Dummy; } + static unsigned getHashValue(const ModelledPHI &V) { return V.hash(); } + static bool isEqual(const ModelledPHI &LHS, const ModelledPHI &RHS) { return LHS == RHS; } }; -typedef DenseSet<ModelledPHI, DenseMapInfo<ModelledPHI>> ModelledPHISet; +using ModelledPHISet = DenseSet<ModelledPHI, DenseMapInfo<ModelledPHI>>; //===----------------------------------------------------------------------===// // ValueTable @@ -325,10 +363,11 @@ public: op_push_back(U.getUser()); std::sort(op_begin(), op_end()); } + void setMemoryUseOrder(unsigned MUO) { MemoryUseOrder = MUO; } void setVolatile(bool V) { Volatile = V; } - virtual hash_code getHashValue() const { + hash_code getHashValue() const override { return hash_combine(GVNExpression::BasicExpression::getHashValue(), MemoryUseOrder, Volatile); } @@ -348,7 +387,7 @@ class ValueTable { DenseMap<size_t, uint32_t> HashNumbering; BumpPtrAllocator Allocator; ArrayRecycler<Value *> Recycler; - uint32_t nextValueNumber; + uint32_t nextValueNumber = 1; /// Create an expression for I based on its opcode and its uses. If I /// touches or reads memory, the expression is also based upon its memory @@ -378,6 +417,8 @@ class ValueTable { } public: + ValueTable() = default; + /// Returns the value number for the specified value, assigning /// it a new number if it did not have one before. uint32_t lookupOrAdd(Value *V) { @@ -483,8 +524,6 @@ public: nextValueNumber = 1; } - ValueTable() : nextValueNumber(1) {} - /// \c Inst uses or touches memory. Return an ID describing the memory state /// at \c Inst such that if getMemoryUseOrder(I1) == getMemoryUseOrder(I2), /// the exact same memory operations happen after I1 and I2. 
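The ModelledPHI normalization that the hunks above rely on boils down to sorting the incoming (block, value) pairs so that equality and hashing are independent of the order in which predecessors are listed. A standalone sketch of just that invariant (toy types, not the LLVM classes):

#include <algorithm>
#include <cassert>
#include <utility>
#include <vector>

struct Block {};
struct Value {};

struct ModelledPhi {
  // Canonical order: by block pointer, then by value pointer.
  std::vector<std::pair<const Block *, const Value *>> Ops;

  explicit ModelledPhi(std::vector<std::pair<const Block *, const Value *>> In)
      : Ops(std::move(In)) {
    std::sort(Ops.begin(), Ops.end());
  }
  bool operator==(const ModelledPhi &O) const { return Ops == O.Ops; }
};

int main() {
  Block B1, B2;
  Value V1, V2;
  ModelledPhi A({{&B1, &V1}, {&B2, &V2}});
  ModelledPhi B({{&B2, &V2}, {&B1, &V1}}); // same PHI, reversed operand order
  assert(A == B); // compares equal after canonicalization
}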
@@ -519,7 +558,8 @@ public: class GVNSink { public: - GVNSink() : VN() {} + GVNSink() = default; + bool run(Function &F) { DEBUG(dbgs() << "GVNSink: running on function @" << F.getName() << "\n"); @@ -576,8 +616,9 @@ private: void foldPointlessPHINodes(BasicBlock *BB) { auto I = BB->begin(); while (PHINode *PN = dyn_cast<PHINode>(I++)) { - if (!all_of(PN->incoming_values(), - [&](const Value *V) { return V == PN->getIncomingValue(0); })) + if (!llvm::all_of(PN->incoming_values(), [&](const Value *V) { + return V == PN->getIncomingValue(0); + })) continue; if (PN->getIncomingValue(0) != PN) PN->replaceAllUsesWith(PN->getIncomingValue(0)); @@ -624,7 +665,7 @@ Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking( SmallVector<Instruction *, 4> NewInsts; for (auto *I : Insts) { if (VN.lookup(I) != VNumToSink) - ActivePreds.erase(I->getParent()); + ActivePreds.remove(I->getParent()); else NewInsts.push_back(I); } @@ -794,7 +835,7 @@ void GVNSink::sinkLastInstruction(ArrayRef<BasicBlock *> Blocks, SmallVector<Value *, 4> NewOperands; for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) { - bool NeedPHI = any_of(Insts, [&I0, O](const Instruction *I) { + bool NeedPHI = llvm::any_of(Insts, [&I0, O](const Instruction *I) { return I->getOperand(O) != I0->getOperand(O); }); if (!NeedPHI) { @@ -860,7 +901,8 @@ public: AU.addPreserved<GlobalsAAWrapperPass>(); } }; -} // namespace + +} // end anonymous namespace PreservedAnalyses GVNSinkPass::run(Function &F, FunctionAnalysisManager &AM) { GVNSink G; @@ -873,6 +915,7 @@ PreservedAnalyses GVNSinkPass::run(Function &F, FunctionAnalysisManager &AM) { } char GVNSinkLegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(GVNSinkLegacyPass, "gvn-sink", "Early GVN sinking of Expressions", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) diff --git a/lib/Transforms/Scalar/GuardWidening.cpp b/lib/Transforms/Scalar/GuardWidening.cpp index fb7c6e15758d..c4aeccb85ca7 100644 --- a/lib/Transforms/Scalar/GuardWidening.cpp +++ b/lib/Transforms/Scalar/GuardWidening.cpp @@ -664,6 +664,7 @@ PreservedAnalyses GuardWideningPass::run(Function &F, return PA; } +#ifndef NDEBUG StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) { switch (WS) { case WS_IllegalOrNegative: @@ -678,6 +679,7 @@ StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) { llvm_unreachable("Fully covered switch above!"); } +#endif char GuardWideningLegacyPass::ID = 0; diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 10782963177c..74d6014d3e3d 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -25,27 +25,54 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/IndVarSimplify.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include 
"llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/CFG.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" @@ -53,6 +80,10 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" +#include <cassert> +#include <cstdint> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "indvars" @@ -91,6 +122,7 @@ DisableLFTR("disable-lftr", cl::Hidden, cl::init(false), cl::desc("Disable Linear Function Test Replace optimization")); namespace { + struct RewritePhi; class IndVarSimplify { @@ -131,7 +163,8 @@ public: bool run(Loop *L); }; -} + +} // end anonymous namespace /// Return true if the SCEV expansion generated by the rewriter can replace the /// original value. SCEV guarantees that it produces the same value, but the way @@ -251,7 +284,6 @@ static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { /// is converted into /// for(int i = 0; i < 10000; ++i) /// bar((double)i); -/// void IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0)); unsigned BackEdge = IncomingEdge^1; @@ -305,7 +337,6 @@ void IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { L->contains(TheBr->getSuccessor(1)))) return; - // If it isn't a comparison with an integer-as-fp (the exit value), we can't // transform it. ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1)); @@ -373,7 +404,6 @@ void IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { // transform the IV. if (Leftover != 0 && int32_t(ExitValue+IncValue) < ExitValue) return; - } else { // If we have a negative stride, we require the init to be greater than the // exit value. @@ -452,7 +482,6 @@ void IndVarSimplify::rewriteNonIntegerIVs(Loop *L) { // First step. Check to see if there are any floating-point recurrences. // If there are, change them into integer recurrences, permitting analysis by // the SCEV routines. - // BasicBlock *Header = L->getHeader(); SmallVector<WeakTrackingVH, 8> PHIs; @@ -472,18 +501,26 @@ void IndVarSimplify::rewriteNonIntegerIVs(Loop *L) { } namespace { + // Collect information about PHI nodes which can be transformed in // rewriteLoopExitValues. struct RewritePhi { PHINode *PN; - unsigned Ith; // Ith incoming value. - Value *Val; // Exit value after expansion. - bool HighCost; // High Cost when expansion. 
+ + // Ith incoming value. + unsigned Ith; + + // Exit value after expansion. + Value *Val; + + // High Cost when expansion. + bool HighCost; RewritePhi(PHINode *P, unsigned I, Value *V, bool H) : PN(P), Ith(I), Val(V), HighCost(H) {} }; -} + +} // end anonymous namespace Value *IndVarSimplify::expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L, Instruction *InsertPt, @@ -747,7 +784,6 @@ void IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) { /// aggressively. bool IndVarSimplify::canLoopBeDeleted( Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet) { - BasicBlock *Preheader = L->getLoopPreheader(); // If there is no preheader, the loop will not be deleted. if (!Preheader) @@ -790,7 +826,9 @@ bool IndVarSimplify::canLoopBeDeleted( } for (auto *BB : L->blocks()) - if (any_of(*BB, [](Instruction &I) { return I.mayHaveSideEffects(); })) + if (llvm::any_of(*BB, [](Instruction &I) { + return I.mayHaveSideEffects(); + })) return false; return true; @@ -801,15 +839,21 @@ bool IndVarSimplify::canLoopBeDeleted( //===----------------------------------------------------------------------===// namespace { + // Collect information about induction variables that are used by sign/zero // extend operations. This information is recorded by CollectExtend and provides // the input to WidenIV. struct WideIVInfo { PHINode *NarrowIV = nullptr; - Type *WidestNativeType = nullptr; // Widest integer type created [sz]ext - bool IsSigned = false; // Was a sext user seen before a zext? + + // Widest integer type created [sz]ext + Type *WidestNativeType = nullptr; + + // Was a sext user seen before a zext? + bool IsSigned = false; }; -} + +} // end anonymous namespace /// Update information about the induction variable that is extended by this /// sign or zero extend operation. This is used to determine the final width of @@ -885,7 +929,6 @@ struct NarrowIVDefUse { /// creating any new induction variables. To do this, it creates a new phi of /// the wider type and redirects all users, either removing extends or inserting /// truncs whenever we stop propagating the type. -/// class WidenIV { // Parameters PHINode *OrigPhi; @@ -902,22 +945,24 @@ class WidenIV { bool HasGuards; // Result - PHINode *WidePhi; - Instruction *WideInc; - const SCEV *WideIncExpr; + PHINode *WidePhi = nullptr; + Instruction *WideInc = nullptr; + const SCEV *WideIncExpr = nullptr; SmallVectorImpl<WeakTrackingVH> &DeadInsts; SmallPtrSet<Instruction *,16> Widened; SmallVector<NarrowIVDefUse, 8> NarrowIVUsers; enum ExtendKind { ZeroExtended, SignExtended, Unknown }; + // A map tracking the kind of extension used to widen each narrow IV // and narrow IV user. // Key: pointer to a narrow IV or IV user. // Value: the kind of extension used to widen this Instruction. DenseMap<AssertingVH<Instruction>, ExtendKind> ExtendKindMap; - typedef std::pair<AssertingVH<Value>, AssertingVH<Instruction>> DefUserPair; + using DefUserPair = std::pair<AssertingVH<Value>, AssertingVH<Instruction>>; + // A map with control-dependent ranges for post increment IV uses. The key is // a pair of IV def and a use of this def denoting the context. 
The value is // a ConstantRange representing possible values of the def at the given @@ -935,6 +980,7 @@ class WidenIV { void calculatePostIncRanges(PHINode *OrigPhi); void calculatePostIncRange(Instruction *NarrowDef, Instruction *NarrowUser); + void updatePostIncRangeInfo(Value *Def, Instruction *UseI, ConstantRange R) { DefUserPair Key(Def, UseI); auto It = PostIncRangeInfos.find(Key); @@ -950,8 +996,7 @@ public: bool HasGuards) : OrigPhi(WI.NarrowIV), WideType(WI.WidestNativeType), LI(LInfo), L(LI->getLoopFor(OrigPhi->getParent())), SE(SEv), DT(DTree), - HasGuards(HasGuards), WidePhi(nullptr), WideInc(nullptr), - WideIncExpr(nullptr), DeadInsts(DI) { + HasGuards(HasGuards), DeadInsts(DI) { assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV"); ExtendKindMap[OrigPhi] = WI.IsSigned ? SignExtended : ZeroExtended; } @@ -969,7 +1014,7 @@ protected: ExtendKind getExtendKind(Instruction *I); - typedef std::pair<const SCEVAddRecExpr *, ExtendKind> WidenedRecTy; + using WidenedRecTy = std::pair<const SCEVAddRecExpr *, ExtendKind>; WidenedRecTy getWideRecurrence(NarrowIVDefUse DU); @@ -984,7 +1029,8 @@ protected: void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef); }; -} // anonymous namespace + +} // end anonymous namespace /// Perform a quick domtree based check for loop invariance assuming that V is /// used within the loop. LoopInfo::isLoopInvariant() seems gratuitous for this @@ -1182,7 +1228,6 @@ const SCEV *WidenIV::getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, /// operands is an AddRec for this loop, return the AddRec and the kind of /// extension used. WidenIV::WidenedRecTy WidenIV::getExtendedOperandRecurrence(NarrowIVDefUse DU) { - // Handle the common case of add<nsw/nuw> const unsigned OpCode = DU.NarrowUse->getOpcode(); // Only Add/Sub/Mul instructions supported yet. @@ -1310,7 +1355,7 @@ bool WidenIV::widenLoopCompare(NarrowIVDefUse DU) { Value *Op = Cmp->getOperand(Cmp->getOperand(0) == DU.NarrowDef ? 1 : 0); unsigned CastWidth = SE->getTypeSizeInBits(Op->getType()); unsigned IVWidth = SE->getTypeSizeInBits(WideType); - assert (CastWidth <= IVWidth && "Unexpected width while widening compare."); + assert(CastWidth <= IVWidth && "Unexpected width while widening compare."); // Widen the compare instruction. IRBuilder<> Builder( @@ -1461,7 +1506,6 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { } /// Add eligible users of NarrowDef to NarrowIVUsers. -/// void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) { const SCEV *NarrowSCEV = SE->getSCEV(NarrowDef); bool NonNegativeDef = @@ -1494,7 +1538,6 @@ void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) { /// /// It would be simpler to delete uses as they are processed, but we must avoid /// invalidating SCEV expressions. -/// PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) { // Is this phi an induction variable? const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(OrigPhi)); @@ -1581,6 +1624,15 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) { if (DU.NarrowDef->use_empty()) DeadInsts.emplace_back(DU.NarrowDef); } + + // Attach any debug information to the new PHI. Since OrigPhi and WidePHI + // evaluate the same recurrence, we can just copy the debug info over. 
+ SmallVector<DbgValueInst *, 1> DbgValues; + llvm::findDbgValues(DbgValues, OrigPhi); + auto *MDPhi = MetadataAsValue::get(WidePhi->getContext(), + ValueAsMetadata::get(WidePhi)); + for (auto &DbgValue : DbgValues) + DbgValue->setOperand(0, MDPhi); return WidePhi; } @@ -1696,12 +1748,12 @@ void WidenIV::calculatePostIncRanges(PHINode *OrigPhi) { // Live IV Reduction - Minimize IVs live across the loop. //===----------------------------------------------------------------------===// - //===----------------------------------------------------------------------===// // Simplification of IV users based on SCEV evaluation. //===----------------------------------------------------------------------===// namespace { + class IndVarSimplifyVisitor : public IVVisitor { ScalarEvolution *SE; const TargetTransformInfo *TTI; @@ -1721,14 +1773,14 @@ public: // Implement the interface used by simplifyUsersOfIV. void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); } }; -} + +} // end anonymous namespace /// Iteratively perform simplification on a worklist of IV users. Each /// successive simplification may push more users which may themselves be /// candidates for simplification. /// /// Sign/Zero extend elimination is interleaved with IV simplification. -/// void IndVarSimplify::simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI) { @@ -1759,7 +1811,8 @@ void IndVarSimplify::simplifyAndExtend(Loop *L, // Information about sign/zero extensions of CurrIV. IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT); - Changed |= simplifyUsersOfIV(CurrIV, SE, DT, LI, DeadInsts, &Visitor); + Changed |= + simplifyUsersOfIV(CurrIV, SE, DT, LI, DeadInsts, Rewriter, &Visitor); if (Visitor.WI.WidestNativeType) { WideIVs.push_back(Visitor.WI); @@ -2501,8 +2554,10 @@ PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, } namespace { + struct IndVarSimplifyLegacyPass : public LoopPass { static char ID; // Pass identification, replacement for typeid + IndVarSimplifyLegacyPass() : LoopPass(ID) { initializeIndVarSimplifyLegacyPassPass(*PassRegistry::getPassRegistry()); } @@ -2529,9 +2584,11 @@ struct IndVarSimplifyLegacyPass : public LoopPass { getLoopAnalysisUsage(AU); } }; -} + +} // end anonymous namespace char IndVarSimplifyLegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(IndVarSimplifyLegacyPass, "indvars", "Induction Variable Simplification", false, false) INITIALIZE_PASS_DEPENDENCY(LoopPass) diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index 99b4458ea0fa..5c4d55bfbb2b 100644 --- a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -1,4 +1,4 @@ -//===-- InductiveRangeCheckElimination.cpp - ------------------------------===// +//===- InductiveRangeCheckElimination.cpp - -------------------------------===// // // The LLVM Compiler Infrastructure // @@ -6,6 +6,7 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// +// // The InductiveRangeCheckElimination pass splits a loop's iteration space into // three disjoint ranges. It does that in a way such that the loop running in // the middle loop provably does not need range checks. 
As an example, it will @@ -39,30 +40,61 @@ // throw_out_of_bounds(); // } // } +// //===----------------------------------------------------------------------===// +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include <algorithm> +#include <cassert> +#include <iterator> +#include <limits> +#include <utility> +#include <vector> using namespace llvm; +using namespace llvm::PatternMatch; static cl::opt<unsigned> LoopSizeCutoff("irce-loop-size-cutoff", cl::Hidden, cl::init(64)); @@ -79,6 +111,9 @@ static cl::opt<int> MaxExitProbReciprocal("irce-max-exit-prob-reciprocal", static cl::opt<bool> SkipProfitabilityChecks("irce-skip-profitability-checks", cl::Hidden, cl::init(false)); +static cl::opt<bool> AllowUnsignedLatchCondition("irce-allow-unsigned-latch", + cl::Hidden, cl::init(true)); + static const char *ClonedLoopTag = "irce.loop.clone"; #define DEBUG_TYPE "irce" @@ -114,15 +149,16 @@ class InductiveRangeCheck { static StringRef rangeCheckKindToStr(RangeCheckKind); - const SCEV *Offset = nullptr; - const SCEV *Scale = nullptr; - Value *Length = nullptr; + const SCEV *Begin = nullptr; + const SCEV *Step = nullptr; + const SCEV *End = nullptr; Use *CheckUse = nullptr; RangeCheckKind Kind = RANGE_CHECK_UNKNOWN; + bool IsSigned = true; static RangeCheckKind parseRangeCheckICmp(Loop *L, ICmpInst *ICI, ScalarEvolution &SE, Value *&Index, - Value *&Length); + Value *&Length, bool &IsSigned); static void extractRangeChecksFromCond(Loop *L, ScalarEvolution &SE, Use &ConditionUse, @@ -130,20 +166,21 @@ class InductiveRangeCheck { SmallPtrSetImpl<Value *> &Visited); public: - const SCEV *getOffset() const { return Offset; } - const SCEV *getScale() const { return Scale; } - Value *getLength() const { return Length; } + const SCEV *getBegin() const { return Begin; } + const SCEV *getStep() const { return Step; } + const SCEV *getEnd() const { return End; } + bool isSigned() const { return IsSigned; } void print(raw_ostream &OS) const { OS << "InductiveRangeCheck:\n"; OS << " Kind: " << 
rangeCheckKindToStr(Kind) << "\n"; - OS << " Offset: "; - Offset->print(OS); - OS << " Scale: "; - Scale->print(OS); - OS << " Length: "; - if (Length) - Length->print(OS); + OS << " Begin: "; + Begin->print(OS); + OS << " Step: "; + Step->print(OS); + OS << " End: "; + if (End) + End->print(OS); else OS << "(null)"; OS << "\n CheckUse: "; @@ -173,6 +210,14 @@ public: Type *getType() const { return Begin->getType(); } const SCEV *getBegin() const { return Begin; } const SCEV *getEnd() const { return End; } + bool isEmpty(ScalarEvolution &SE, bool IsSigned) const { + if (Begin == End) + return true; + if (IsSigned) + return SE.isKnownPredicate(ICmpInst::ICMP_SGE, Begin, End); + else + return SE.isKnownPredicate(ICmpInst::ICMP_UGE, Begin, End); + } }; /// This is the value the condition of the branch needs to evaluate to for the @@ -183,7 +228,8 @@ public: /// check is redundant and can be constant-folded away. The induction /// variable is not required to be the canonical {0,+,1} induction variable. Optional<Range> computeSafeIterationSpace(ScalarEvolution &SE, - const SCEVAddRecExpr *IndVar) const; + const SCEVAddRecExpr *IndVar, + bool IsLatchSigned) const; /// Parse out a set of inductive range checks from \p BI and append them to \p /// Checks. @@ -199,6 +245,7 @@ public: class InductiveRangeCheckElimination : public LoopPass { public: static char ID; + InductiveRangeCheckElimination() : LoopPass(ID) { initializeInductiveRangeCheckEliminationPass( *PassRegistry::getPassRegistry()); @@ -212,8 +259,9 @@ public: bool runOnLoop(Loop *L, LPPassManager &LPM) override; }; +} // end anonymous namespace + char InductiveRangeCheckElimination::ID = 0; -} INITIALIZE_PASS_BEGIN(InductiveRangeCheckElimination, "irce", "Inductive range check elimination", false, false) @@ -247,12 +295,10 @@ StringRef InductiveRangeCheck::rangeCheckKindToStr( /// range checked, and set `Length` to the upper limit `Index` is being range /// checked with if (and only if) the range check type is stronger or equal to /// RANGE_CHECK_UPPER. 
-/// InductiveRangeCheck::RangeCheckKind InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI, ScalarEvolution &SE, Value *&Index, - Value *&Length) { - + Value *&Length, bool &IsSigned) { auto IsNonNegativeAndNotLoopVarying = [&SE, L](Value *V) { const SCEV *S = SE.getSCEV(V); if (isa<SCEVCouldNotCompute>(S)) @@ -262,8 +308,6 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI, SE.isKnownNonNegative(S); }; - using namespace llvm::PatternMatch; - ICmpInst::Predicate Pred = ICI->getPredicate(); Value *LHS = ICI->getOperand(0); Value *RHS = ICI->getOperand(1); @@ -276,6 +320,7 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI, std::swap(LHS, RHS); LLVM_FALLTHROUGH; case ICmpInst::ICMP_SGE: + IsSigned = true; if (match(RHS, m_ConstantInt<0>())) { Index = LHS; return RANGE_CHECK_LOWER; @@ -286,6 +331,7 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI, std::swap(LHS, RHS); LLVM_FALLTHROUGH; case ICmpInst::ICMP_SGT: + IsSigned = true; if (match(RHS, m_ConstantInt<-1>())) { Index = LHS; return RANGE_CHECK_LOWER; @@ -302,6 +348,7 @@ InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI, std::swap(LHS, RHS); LLVM_FALLTHROUGH; case ICmpInst::ICMP_UGT: + IsSigned = false; if (IsNonNegativeAndNotLoopVarying(LHS)) { Index = RHS; Length = LHS; @@ -317,42 +364,16 @@ void InductiveRangeCheck::extractRangeChecksFromCond( Loop *L, ScalarEvolution &SE, Use &ConditionUse, SmallVectorImpl<InductiveRangeCheck> &Checks, SmallPtrSetImpl<Value *> &Visited) { - using namespace llvm::PatternMatch; - Value *Condition = ConditionUse.get(); if (!Visited.insert(Condition).second) return; + // TODO: Do the same for OR, XOR, NOT etc? if (match(Condition, m_And(m_Value(), m_Value()))) { - SmallVector<InductiveRangeCheck, 8> SubChecks; extractRangeChecksFromCond(L, SE, cast<User>(Condition)->getOperandUse(0), - SubChecks, Visited); + Checks, Visited); extractRangeChecksFromCond(L, SE, cast<User>(Condition)->getOperandUse(1), - SubChecks, Visited); - - if (SubChecks.size() == 2) { - // Handle a special case where we know how to merge two checks separately - // checking the upper and lower bounds into a full range check. - const auto &RChkA = SubChecks[0]; - const auto &RChkB = SubChecks[1]; - if ((RChkA.Length == RChkB.Length || !RChkA.Length || !RChkB.Length) && - RChkA.Offset == RChkB.Offset && RChkA.Scale == RChkB.Scale) { - - // If RChkA.Kind == RChkB.Kind then we just found two identical checks. - // But if one of them is a RANGE_CHECK_LOWER and the other is a - // RANGE_CHECK_UPPER (only possibility if they're different) then - // together they form a RANGE_CHECK_BOTH. - SubChecks[0].Kind = - (InductiveRangeCheck::RangeCheckKind)(RChkA.Kind | RChkB.Kind); - SubChecks[0].Length = RChkA.Length ? RChkA.Length : RChkB.Length; - SubChecks[0].CheckUse = &ConditionUse; - - // We updated one of the checks in place, now erase the other. 
- SubChecks.pop_back(); - } - } - - Checks.insert(Checks.end(), SubChecks.begin(), SubChecks.end()); + Checks, Visited); return; } @@ -361,7 +382,8 @@ void InductiveRangeCheck::extractRangeChecksFromCond( return; Value *Length = nullptr, *Index; - auto RCKind = parseRangeCheckICmp(L, ICI, SE, Index, Length); + bool IsSigned; + auto RCKind = parseRangeCheckICmp(L, ICI, SE, Index, Length, IsSigned); if (RCKind == InductiveRangeCheck::RANGE_CHECK_UNKNOWN) return; @@ -373,18 +395,18 @@ void InductiveRangeCheck::extractRangeChecksFromCond( return; InductiveRangeCheck IRC; - IRC.Length = Length; - IRC.Offset = IndexAddRec->getStart(); - IRC.Scale = IndexAddRec->getStepRecurrence(SE); + IRC.End = Length ? SE.getSCEV(Length) : nullptr; + IRC.Begin = IndexAddRec->getStart(); + IRC.Step = IndexAddRec->getStepRecurrence(SE); IRC.CheckUse = &ConditionUse; IRC.Kind = RCKind; + IRC.IsSigned = IsSigned; Checks.push_back(IRC); } void InductiveRangeCheck::extractRangeChecksFromBranch( BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo &BPI, SmallVectorImpl<InductiveRangeCheck> &Checks) { - if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch()) return; @@ -435,16 +457,16 @@ namespace { // kinds of loops we can deal with -- ones that have a single latch that is also // an exiting block *and* have a canonical induction variable. struct LoopStructure { - const char *Tag; + const char *Tag = ""; - BasicBlock *Header; - BasicBlock *Latch; + BasicBlock *Header = nullptr; + BasicBlock *Latch = nullptr; // `Latch's terminator instruction is `LatchBr', and it's `LatchBrExitIdx'th // successor is `LatchExit', the exit block of the loop. - BranchInst *LatchBr; - BasicBlock *LatchExit; - unsigned LatchBrExitIdx; + BranchInst *LatchBr = nullptr; + BasicBlock *LatchExit = nullptr; + unsigned LatchBrExitIdx = std::numeric_limits<unsigned>::max(); // The loop represented by this instance of LoopStructure is semantically // equivalent to: @@ -452,18 +474,17 @@ struct LoopStructure { // intN_ty inc = IndVarIncreasing ? 1 : -1; // pred_ty predicate = IndVarIncreasing ? ICMP_SLT : ICMP_SGT; // - // for (intN_ty iv = IndVarStart; predicate(iv, LoopExitAt); iv = IndVarNext) + // for (intN_ty iv = IndVarStart; predicate(iv, LoopExitAt); iv = IndVarBase) // ... body ... - Value *IndVarNext; - Value *IndVarStart; - Value *LoopExitAt; - bool IndVarIncreasing; + Value *IndVarBase = nullptr; + Value *IndVarStart = nullptr; + Value *IndVarStep = nullptr; + Value *LoopExitAt = nullptr; + bool IndVarIncreasing = false; + bool IsSignedPredicate = true; - LoopStructure() - : Tag(""), Header(nullptr), Latch(nullptr), LatchBr(nullptr), - LatchExit(nullptr), LatchBrExitIdx(-1), IndVarNext(nullptr), - IndVarStart(nullptr), LoopExitAt(nullptr), IndVarIncreasing(false) {} + LoopStructure() = default; template <typename M> LoopStructure map(M Map) const { LoopStructure Result; @@ -473,10 +494,12 @@ struct LoopStructure { Result.LatchBr = cast<BranchInst>(Map(LatchBr)); Result.LatchExit = cast<BasicBlock>(Map(LatchExit)); Result.LatchBrExitIdx = LatchBrExitIdx; - Result.IndVarNext = Map(IndVarNext); + Result.IndVarBase = Map(IndVarBase); Result.IndVarStart = Map(IndVarStart); + Result.IndVarStep = Map(IndVarStep); Result.LoopExitAt = Map(LoopExitAt); Result.IndVarIncreasing = IndVarIncreasing; + Result.IsSignedPredicate = IsSignedPredicate; return Result; } @@ -494,7 +517,6 @@ struct LoopStructure { /// loops to run any remaining iterations. 
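/// As a hedged sketch (names invented here, an increasing induction variable
/// assumed), the result of constraining
///   for (iv = Start; iv < Stop; ++iv) body_with_checks(iv);
/// is roughly
///   for (iv = Start; iv < Begin; ++iv) body_with_checks(iv);  // pre loop
///   for (; iv < End; ++iv)             body_no_checks(iv);    // main loop
///   for (; iv < Stop; ++iv)            body_with_checks(iv);  // post loop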
The pre loop runs any iterations in /// which the induction variable is < Begin, and the post loop runs any /// iterations in which the induction variable is >= End. -/// class LoopConstrainer { // The representation of a clone of the original loop we started out with. struct ClonedLoop { @@ -511,13 +533,12 @@ class LoopConstrainer { // Result of rewriting the range of a loop. See changeIterationSpaceEnd for // more details on what these fields mean. struct RewrittenRangeInfo { - BasicBlock *PseudoExit; - BasicBlock *ExitSelector; + BasicBlock *PseudoExit = nullptr; + BasicBlock *ExitSelector = nullptr; std::vector<PHINode *> PHIValuesAtPseudoExit; - PHINode *IndVarEnd; + PHINode *IndVarEnd = nullptr; - RewrittenRangeInfo() - : PseudoExit(nullptr), ExitSelector(nullptr), IndVarEnd(nullptr) {} + RewrittenRangeInfo() = default; }; // Calculated subranges we restrict the iteration space of the main loop to. @@ -541,14 +562,12 @@ class LoopConstrainer { // Compute a safe set of limits for the main loop to run in -- effectively the // intersection of `Range' and the iteration space of the original loop. // Return None if unable to compute the set of subranges. - // - Optional<SubRanges> calculateSubRanges() const; + Optional<SubRanges> calculateSubRanges(bool IsSignedPredicate) const; // Clone `OriginalLoop' and return the result in CLResult. The IR after // running `cloneLoop' is well formed except for the PHI nodes in CLResult -- // the PHI nodes say that there is an incoming edge from `OriginalPreheader` // but there is no such edge. - // void cloneLoop(ClonedLoop &CLResult, const char *Tag) const; // Create the appropriate loop structure needed to describe a cloned copy of @@ -577,7 +596,6 @@ class LoopConstrainer { // After changeIterationSpaceEnd, `Preheader' is no longer a legitimate // preheader because it is made to branch to the loop header only // conditionally. - // RewrittenRangeInfo changeIterationSpaceEnd(const LoopStructure &LS, BasicBlock *Preheader, Value *ExitLoopAt, @@ -585,7 +603,6 @@ class LoopConstrainer { // The loop denoted by `LS' has `OldPreheader' as its preheader. This // function creates a new preheader for `LS' and returns it. - // BasicBlock *createPreheader(const LoopStructure &LS, BasicBlock *OldPreheader, const char *Tag) const; @@ -613,12 +630,13 @@ class LoopConstrainer { // Information about the original loop we started out with. Loop &OriginalLoop; - const SCEV *LatchTakenCount; - BasicBlock *OriginalPreheader; + + const SCEV *LatchTakenCount = nullptr; + BasicBlock *OriginalPreheader = nullptr; // The preheader of the main loop. This may or may not be different from // `OriginalPreheader'. - BasicBlock *MainLoopPreheader; + BasicBlock *MainLoopPreheader = nullptr; // The range we need to run the main loop in. InductiveRangeCheck::Range Range; @@ -632,15 +650,14 @@ public: const LoopStructure &LS, ScalarEvolution &SE, DominatorTree &DT, InductiveRangeCheck::Range R) : F(*L.getHeader()->getParent()), Ctx(L.getHeader()->getContext()), - SE(SE), DT(DT), LPM(LPM), LI(LI), OriginalLoop(L), - LatchTakenCount(nullptr), OriginalPreheader(nullptr), - MainLoopPreheader(nullptr), Range(R), MainLoopStructure(LS) {} + SE(SE), DT(DT), LPM(LPM), LI(LI), OriginalLoop(L), Range(R), + MainLoopStructure(LS) {} // Entry point for the algorithm. Returns true on success. 
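  // A hedged usage sketch (the exact constructor argument order is elided
  // here; see the constructor above): build a LoopConstrainer from the
  // parsed LoopStructure and the computed safe range, then call run():
  //   LoopConstrainer LC(/* loop, analyses, LS, safe range */);
  //   bool Changed = LC.run();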
bool run(); }; -} +} // end anonymous namespace void LoopConstrainer::replacePHIBlock(PHINode *PN, BasicBlock *Block, BasicBlock *ReplaceBy) { @@ -649,22 +666,55 @@ void LoopConstrainer::replacePHIBlock(PHINode *PN, BasicBlock *Block, PN->setIncomingBlock(i, ReplaceBy); } -static bool CanBeSMax(ScalarEvolution &SE, const SCEV *S) { - APInt SMax = - APInt::getSignedMaxValue(cast<IntegerType>(S->getType())->getBitWidth()); - return SE.getSignedRange(S).contains(SMax) && - SE.getUnsignedRange(S).contains(SMax); +static bool CanBeMax(ScalarEvolution &SE, const SCEV *S, bool Signed) { + APInt Max = Signed ? + APInt::getSignedMaxValue(cast<IntegerType>(S->getType())->getBitWidth()) : + APInt::getMaxValue(cast<IntegerType>(S->getType())->getBitWidth()); + return SE.getSignedRange(S).contains(Max) && + SE.getUnsignedRange(S).contains(Max); +} + +static bool SumCanReachMax(ScalarEvolution &SE, const SCEV *S1, const SCEV *S2, + bool Signed) { + // S1 < INT_MAX - S2 ===> S1 + S2 < INT_MAX. + assert(SE.isKnownNonNegative(S2) && + "We expected the 2nd arg to be non-negative!"); + const SCEV *Max = SE.getConstant( + Signed ? APInt::getSignedMaxValue( + cast<IntegerType>(S1->getType())->getBitWidth()) + : APInt::getMaxValue( + cast<IntegerType>(S1->getType())->getBitWidth())); + const SCEV *CapForS1 = SE.getMinusSCEV(Max, S2); + return !SE.isKnownPredicate(Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, + S1, CapForS1); +} + +static bool CanBeMin(ScalarEvolution &SE, const SCEV *S, bool Signed) { + APInt Min = Signed ? + APInt::getSignedMinValue(cast<IntegerType>(S->getType())->getBitWidth()) : + APInt::getMinValue(cast<IntegerType>(S->getType())->getBitWidth()); + return SE.getSignedRange(S).contains(Min) && + SE.getUnsignedRange(S).contains(Min); } -static bool CanBeSMin(ScalarEvolution &SE, const SCEV *S) { - APInt SMin = - APInt::getSignedMinValue(cast<IntegerType>(S->getType())->getBitWidth()); - return SE.getSignedRange(S).contains(SMin) && - SE.getUnsignedRange(S).contains(SMin); +static bool SumCanReachMin(ScalarEvolution &SE, const SCEV *S1, const SCEV *S2, + bool Signed) { + // S1 > INT_MIN - S2 ===> S1 + S2 > INT_MIN. + assert(SE.isKnownNonPositive(S2) && + "We expected the 2nd arg to be non-positive!"); + const SCEV *Min = SE.getConstant( + Signed ? APInt::getSignedMinValue( + cast<IntegerType>(S1->getType())->getBitWidth()) + : APInt::getMinValue( + cast<IntegerType>(S1->getType())->getBitWidth())); + const SCEV *CapForS1 = SE.getMinusSCEV(Min, S2); + return !SE.isKnownPredicate(Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT, + S1, CapForS1); } Optional<LoopStructure> -LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BPI, +LoopStructure::parseLoopStructure(ScalarEvolution &SE, + BranchProbabilityInfo &BPI, Loop &L, const char *&FailureReason) { if (!L.isLoopSimplifyForm()) { FailureReason = "loop not in LoopSimplify form"; return None; } @@ -766,7 +816,11 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP return AR->getNoWrapFlags(SCEV::FlagNSW) != SCEV::FlagAnyWrap; }; - auto IsInductionVar = [&](const SCEVAddRecExpr *AR, bool &IsIncreasing) { + // Here we check whether the suggested AddRec is an induction variable that + // can be handled (i.e. with known constant step), and if so, calculate its + // step and identify whether it is increasing or decreasing.
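+  // For example (hedged, not from this patch): an AddRec {0,+,2} yields
+  // StepCI == 2 and IsIncreasing == true, {n,+,-1} yields StepCI == -1 and
+  // IsIncreasing == false, and a symbolic step such as {0,+,%m} is rejected
+  // because the step is not a SCEVConstant.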
+ auto IsInductionVar = [&](const SCEVAddRecExpr *AR, bool &IsIncreasing, + ConstantInt *&StepCI) { if (!AR->isAffine()) return false; @@ -778,11 +832,10 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP if (const SCEVConstant *StepExpr = dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE))) { - ConstantInt *StepCI = StepExpr->getValue(); - if (StepCI->isOne() || StepCI->isMinusOne()) { - IsIncreasing = StepCI->isOne(); - return true; - } + StepCI = StepExpr->getValue(); + assert(!StepCI->isZero() && "Zero step?"); + IsIncreasing = !StepCI->isNegative(); + return true; } return false; @@ -791,59 +844,87 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP // `ICI` is interpreted as taking the backedge if the *next* value of the // induction variable satisfies some constraint. - const SCEVAddRecExpr *IndVarNext = cast<SCEVAddRecExpr>(LeftSCEV); + const SCEVAddRecExpr *IndVarBase = cast<SCEVAddRecExpr>(LeftSCEV); bool IsIncreasing = false; - if (!IsInductionVar(IndVarNext, IsIncreasing)) { + bool IsSignedPredicate = true; + ConstantInt *StepCI; + if (!IsInductionVar(IndVarBase, IsIncreasing, StepCI)) { FailureReason = "LHS in icmp not induction variable"; return None; } - const SCEV *StartNext = IndVarNext->getStart(); - const SCEV *Addend = SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE)); + const SCEV *StartNext = IndVarBase->getStart(); + const SCEV *Addend = SE.getNegativeSCEV(IndVarBase->getStepRecurrence(SE)); const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend); + const SCEV *Step = SE.getSCEV(StepCI); ConstantInt *One = ConstantInt::get(IndVarTy, 1); - // TODO: generalize the predicates here to also match their unsigned variants. if (IsIncreasing) { bool DecreasedRightValueByOne = false; - // Try to turn eq/ne predicates to those we can work with. - if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1) - // while (++i != len) { while (++i < len) { - // ... ---> ... - // } } - Pred = ICmpInst::ICMP_SLT; - else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 && - !CanBeSMin(SE, RightSCEV)) { - // while (true) { while (true) { - // if (++i == len) ---> if (++i > len - 1) - // break; break; - // ... ... - // } } - Pred = ICmpInst::ICMP_SGT; - RightSCEV = SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType())); - DecreasedRightValueByOne = true; + if (StepCI->isOne()) { + // Try to turn eq/ne predicates to those we can work with. + if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1) + // while (++i != len) { while (++i < len) { + // ... ---> ... + // } } + // If both parts are known non-negative, it is profitable to use + // unsigned comparison in increasing loop. This allows us to make the + // comparison check against "RightSCEV + 1" more optimistic. + if (SE.isKnownNonNegative(IndVarStart) && + SE.isKnownNonNegative(RightSCEV)) + Pred = ICmpInst::ICMP_ULT; + else + Pred = ICmpInst::ICMP_SLT; + else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 && + !CanBeMin(SE, RightSCEV, /* IsSignedPredicate */ true)) { + // while (true) { while (true) { + // if (++i == len) ---> if (++i > len - 1) + // break; break; + // ... ... + // } } + // TODO: Insert ICMP_UGT if both are non-negative? 
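+          // Worked instance (hedged): with i8 and len == -128, rewriting to
+          // `++i > len - 1` would wrap len - 1 around to +127; this is why
+          // CanBeMin must rule out RightSCEV == SINT_MIN before the rewrite.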
+ Pred = ICmpInst::ICMP_SGT; + RightSCEV = SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType())); + DecreasedRightValueByOne = true; + } } + bool LTPred = (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_ULT); + bool GTPred = (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_UGT); bool FoundExpectedPred = - (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 1) || - (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 0); + (LTPred && LatchBrExitIdx == 1) || (GTPred && LatchBrExitIdx == 0); if (!FoundExpectedPred) { FailureReason = "expected icmp slt semantically, found something else"; return None; } + IsSignedPredicate = + Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGT; + + if (!IsSignedPredicate && !AllowUnsignedLatchCondition) { + FailureReason = "unsigned latch conditions are explicitly prohibited"; + return None; + } + + // The predicate that we need to check that the induction variable lies + // within bounds. + ICmpInst::Predicate BoundPred = + IsSignedPredicate ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT; + if (LatchBrExitIdx == 0) { - if (CanBeSMax(SE, RightSCEV)) { + const SCEV *StepMinusOne = SE.getMinusSCEV(Step, + SE.getOne(Step->getType())); + if (SumCanReachMax(SE, RightSCEV, StepMinusOne, IsSignedPredicate)) { // TODO: this restriction is easily removable -- we just have to // remember that the icmp was an slt and not an sle. - FailureReason = "limit may overflow when coercing sle to slt"; + FailureReason = "limit may overflow when coercing le to lt"; return None; } if (!SE.isLoopEntryGuardedByCond( - &L, CmpInst::ICMP_SLT, IndVarStart, - SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType())))) { + &L, BoundPred, IndVarStart, + SE.getAddExpr(RightSCEV, Step))) { FailureReason = "Induction variable start not bounded by upper limit"; return None; } @@ -855,8 +936,7 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP RightValue = B.CreateAdd(RightValue, One); } } else { - if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SLT, IndVarStart, - RightSCEV)) { + if (!SE.isLoopEntryGuardedByCond(&L, BoundPred, IndVarStart, RightSCEV)) { FailureReason = "Induction variable start not bounded by upper limit"; return None; } @@ -865,43 +945,65 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP } } else { bool IncreasedRightValueByOne = false; - // Try to turn eq/ne predicates to those we can work with. - if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1) - // while (--i != len) { while (--i > len) { - // ... ---> ... - // } } - Pred = ICmpInst::ICMP_SGT; - else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 && - !CanBeSMax(SE, RightSCEV)) { - // while (true) { while (true) { - // if (--i == len) ---> if (--i < len + 1) - // break; break; - // ... ... - // } } - Pred = ICmpInst::ICMP_SLT; - RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType())); - IncreasedRightValueByOne = true; + if (StepCI->isMinusOne()) { + // Try to turn eq/ne predicates to those we can work with. + if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1) + // while (--i != len) { while (--i > len) { + // ... ---> ... + // } } + // We intentionally don't turn the predicate into UGT even if we know + // that both operands are non-negative, because it will only pessimize + // our check against "RightSCEV - 1". 
+ Pred = ICmpInst::ICMP_SGT; + else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 && + !CanBeMax(SE, RightSCEV, /* IsSignedPredicate */ true)) { + // while (true) { while (true) { + // if (--i == len) ---> if (--i < len + 1) + // break; break; + // ... ... + // } } + // TODO: Insert ICMP_ULT if both are non-negative? + Pred = ICmpInst::ICMP_SLT; + RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType())); + IncreasedRightValueByOne = true; + } } + bool LTPred = (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_ULT); + bool GTPred = (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_UGT); + bool FoundExpectedPred = - (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 1) || - (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 0); + (GTPred && LatchBrExitIdx == 1) || (LTPred && LatchBrExitIdx == 0); if (!FoundExpectedPred) { FailureReason = "expected icmp sgt semantically, found something else"; return None; } + IsSignedPredicate = + Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGT; + + if (!IsSignedPredicate && !AllowUnsignedLatchCondition) { + FailureReason = "unsigned latch conditions are explicitly prohibited"; + return None; + } + + // The predicate that we need to check that the induction variable lies + // within bounds. + ICmpInst::Predicate BoundPred = + IsSignedPredicate ? CmpInst::ICMP_SGT : CmpInst::ICMP_UGT; + if (LatchBrExitIdx == 0) { - if (CanBeSMin(SE, RightSCEV)) { + const SCEV *StepPlusOne = SE.getAddExpr(Step, SE.getOne(Step->getType())); + if (SumCanReachMin(SE, RightSCEV, StepPlusOne, IsSignedPredicate)) { // TODO: this restriction is easily removable -- we just have to // remember that the icmp was an sgt and not an sge. - FailureReason = "limit may overflow when coercing sge to sgt"; + FailureReason = "limit may overflow when coercing ge to gt"; return None; } if (!SE.isLoopEntryGuardedByCond( - &L, CmpInst::ICMP_SGT, IndVarStart, + &L, BoundPred, IndVarStart, SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType())))) { FailureReason = "Induction variable start not bounded by lower limit"; return None; @@ -914,8 +1016,7 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP RightValue = B.CreateSub(RightValue, One); } } else { - if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SGT, IndVarStart, - RightSCEV)) { + if (!SE.isLoopEntryGuardedByCond(&L, BoundPred, IndVarStart, RightSCEV)) { FailureReason = "Induction variable start not bounded by lower limit"; return None; } @@ -923,7 +1024,6 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP "Right value can be increased only for LatchBrExitIdx == 0!"); } } - BasicBlock *LatchExit = LatchBr->getSuccessor(LatchBrExitIdx); assert(SE.getLoopDisposition(LatchCount, &L) == @@ -946,9 +1046,11 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP Result.LatchExit = LatchExit; Result.LatchBrExitIdx = LatchBrExitIdx; Result.IndVarStart = IndVarStartV; - Result.IndVarNext = LeftValue; + Result.IndVarStep = StepCI; + Result.IndVarBase = LeftValue; Result.IndVarIncreasing = IsIncreasing; Result.LoopExitAt = RightValue; + Result.IsSignedPredicate = IsSignedPredicate; FailureReason = nullptr; @@ -956,7 +1058,7 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP } Optional<LoopConstrainer::SubRanges> -LoopConstrainer::calculateSubRanges() const { +LoopConstrainer::calculateSubRanges(bool IsSignedPredicate) const { IntegerType *Ty = cast<IntegerType>(LatchTakenCount->getType()); if 
(Range.getType() != Ty) @@ -999,26 +1101,31 @@ LoopConstrainer::calculateSubRanges() const { // that case, `Clamp` will always return `Smallest` and // [`Result.LowLimit`, `Result.HighLimit`) = [`Smallest`, `Smallest`) // will be an empty range. Returning an empty range is always safe. - // Smallest = SE.getAddExpr(End, One); Greatest = SE.getAddExpr(Start, One); GreatestSeen = Start; } - auto Clamp = [this, Smallest, Greatest](const SCEV *S) { - return SE.getSMaxExpr(Smallest, SE.getSMinExpr(Greatest, S)); + auto Clamp = [this, Smallest, Greatest, IsSignedPredicate](const SCEV *S) { + return IsSignedPredicate + ? SE.getSMaxExpr(Smallest, SE.getSMinExpr(Greatest, S)) + : SE.getUMaxExpr(Smallest, SE.getUMinExpr(Greatest, S)); }; - // In some cases we can prove that we don't need a pre or post loop + // In some cases we can prove that we don't need a pre or post loop. + ICmpInst::Predicate PredLE = + IsSignedPredicate ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; + ICmpInst::Predicate PredLT = + IsSignedPredicate ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; bool ProvablyNoPreloop = - SE.isKnownPredicate(ICmpInst::ICMP_SLE, Range.getBegin(), Smallest); + SE.isKnownPredicate(PredLE, Range.getBegin(), Smallest); if (!ProvablyNoPreloop) Result.LowLimit = Clamp(Range.getBegin()); bool ProvablyNoPostLoop = - SE.isKnownPredicate(ICmpInst::ICMP_SLT, GreatestSeen, Range.getEnd()); + SE.isKnownPredicate(PredLT, GreatestSeen, Range.getEnd()); if (!ProvablyNoPostLoop) Result.HighLimit = Clamp(Range.getEnd()); @@ -1082,7 +1189,6 @@ void LoopConstrainer::cloneLoop(LoopConstrainer::ClonedLoop &Result, LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd( const LoopStructure &LS, BasicBlock *Preheader, Value *ExitSubloopAt, BasicBlock *ContinuationBlock) const { - // We start with a loop with a single latch: // // +--------------------+ @@ -1153,7 +1259,6 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd( // | original exit <----+ // | | // +--------------------+ - // RewrittenRangeInfo RRI; @@ -1165,22 +1270,35 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd( BranchInst *PreheaderJump = cast<BranchInst>(Preheader->getTerminator()); bool Increasing = LS.IndVarIncreasing; + bool IsSignedPredicate = LS.IsSignedPredicate; IRBuilder<> B(PreheaderJump); // EnterLoopCond - is it okay to start executing this `LS'? - Value *EnterLoopCond = Increasing - ? B.CreateICmpSLT(LS.IndVarStart, ExitSubloopAt) - : B.CreateICmpSGT(LS.IndVarStart, ExitSubloopAt); + Value *EnterLoopCond = nullptr; + if (Increasing) + EnterLoopCond = IsSignedPredicate + ? B.CreateICmpSLT(LS.IndVarStart, ExitSubloopAt) + : B.CreateICmpULT(LS.IndVarStart, ExitSubloopAt); + else + EnterLoopCond = IsSignedPredicate + ? B.CreateICmpSGT(LS.IndVarStart, ExitSubloopAt) + : B.CreateICmpUGT(LS.IndVarStart, ExitSubloopAt); B.CreateCondBr(EnterLoopCond, LS.Header, RRI.PseudoExit); PreheaderJump->eraseFromParent(); LS.LatchBr->setSuccessor(LS.LatchBrExitIdx, RRI.ExitSelector); B.SetInsertPoint(LS.LatchBr); - Value *TakeBackedgeLoopCond = - Increasing ? B.CreateICmpSLT(LS.IndVarNext, ExitSubloopAt) - : B.CreateICmpSGT(LS.IndVarNext, ExitSubloopAt); + Value *TakeBackedgeLoopCond = nullptr; + if (Increasing) + TakeBackedgeLoopCond = IsSignedPredicate + ? B.CreateICmpSLT(LS.IndVarBase, ExitSubloopAt) + : B.CreateICmpULT(LS.IndVarBase, ExitSubloopAt); + else + TakeBackedgeLoopCond = IsSignedPredicate + ? 
B.CreateICmpSGT(LS.IndVarBase, ExitSubloopAt) + : B.CreateICmpUGT(LS.IndVarBase, ExitSubloopAt); Value *CondForBranch = LS.LatchBrExitIdx == 1 ? TakeBackedgeLoopCond : B.CreateNot(TakeBackedgeLoopCond); @@ -1192,9 +1310,15 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd( // IterationsLeft - are there any more iterations left, given the original // upper bound on the induction variable? If not, we branch to the "real" // exit. - Value *IterationsLeft = Increasing - ? B.CreateICmpSLT(LS.IndVarNext, LS.LoopExitAt) - : B.CreateICmpSGT(LS.IndVarNext, LS.LoopExitAt); + Value *IterationsLeft = nullptr; + if (Increasing) + IterationsLeft = IsSignedPredicate + ? B.CreateICmpSLT(LS.IndVarBase, LS.LoopExitAt) + : B.CreateICmpULT(LS.IndVarBase, LS.LoopExitAt); + else + IterationsLeft = IsSignedPredicate + ? B.CreateICmpSGT(LS.IndVarBase, LS.LoopExitAt) + : B.CreateICmpUGT(LS.IndVarBase, LS.LoopExitAt); B.CreateCondBr(IterationsLeft, RRI.PseudoExit, LS.LatchExit); BranchInst *BranchToContinuation = @@ -1217,10 +1341,10 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd( RRI.PHIValuesAtPseudoExit.push_back(NewPHI); } - RRI.IndVarEnd = PHINode::Create(LS.IndVarNext->getType(), 2, "indvar.end", + RRI.IndVarEnd = PHINode::Create(LS.IndVarBase->getType(), 2, "indvar.end", BranchToContinuation); RRI.IndVarEnd->addIncoming(LS.IndVarStart, Preheader); - RRI.IndVarEnd->addIncoming(LS.IndVarNext, RRI.ExitSelector); + RRI.IndVarEnd->addIncoming(LS.IndVarBase, RRI.ExitSelector); // The latch exit now has a branch from `RRI.ExitSelector' instead of // `LS.Latch'. The PHI nodes need to be updated to reflect that. @@ -1237,7 +1361,6 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd( void LoopConstrainer::rewriteIncomingValuesForPHIs( LoopStructure &LS, BasicBlock *ContinuationBlock, const LoopConstrainer::RewrittenRangeInfo &RRI) const { - unsigned PHIIndex = 0; for (Instruction &I : *LS.Header) { auto *PN = dyn_cast<PHINode>(&I); @@ -1255,7 +1378,6 @@ void LoopConstrainer::rewriteIncomingValuesForPHIs( BasicBlock *LoopConstrainer::createPreheader(const LoopStructure &LS, BasicBlock *OldPreheader, const char *Tag) const { - BasicBlock *Preheader = BasicBlock::Create(Ctx, Tag, &F, LS.Header); BranchInst::Create(LS.Header, Preheader); @@ -1282,7 +1404,7 @@ void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) { Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent, ValueToValueMapTy &VM) { - Loop &New = *new Loop(); + Loop &New = *LI.AllocateLoop(); if (Parent) Parent->addChildLoop(&New); else @@ -1311,7 +1433,8 @@ bool LoopConstrainer::run() { OriginalPreheader = Preheader; MainLoopPreheader = Preheader; - Optional<SubRanges> MaybeSR = calculateSubRanges(); + bool IsSignedPredicate = MainLoopStructure.IsSignedPredicate; + Optional<SubRanges> MaybeSR = calculateSubRanges(IsSignedPredicate); if (!MaybeSR.hasValue()) { DEBUG(dbgs() << "irce: could not compute subranges\n"); return false; @@ -1320,7 +1443,7 @@ bool LoopConstrainer::run() { SubRanges SR = MaybeSR.getValue(); bool Increasing = MainLoopStructure.IndVarIncreasing; IntegerType *IVTy = - cast<IntegerType>(MainLoopStructure.IndVarNext->getType()); + cast<IntegerType>(MainLoopStructure.IndVarBase->getType()); SCEVExpander Expander(SE, F.getParent()->getDataLayout(), "irce"); Instruction *InsertPt = OriginalPreheader->getTerminator(); @@ -1345,7 +1468,7 @@ bool LoopConstrainer::run() { if (Increasing) ExitPreLoopAtSCEV = 
*SR.LowLimit; else { - if (CanBeSMin(SE, *SR.HighLimit)) { + if (CanBeMin(SE, *SR.HighLimit, IsSignedPredicate)) { DEBUG(dbgs() << "irce: could not prove no-overflow when computing " << "preloop exit limit. HighLimit = " << *(*SR.HighLimit) << "\n"); @@ -1354,6 +1477,13 @@ bool LoopConstrainer::run() { ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS); } + if (!isSafeToExpandAt(ExitPreLoopAtSCEV, InsertPt, SE)) { + DEBUG(dbgs() << "irce: could not prove that it is safe to expand the" + << " preloop exit limit " << *ExitPreLoopAtSCEV + << " at block " << InsertPt->getParent()->getName() << "\n"); + return false; + } + ExitPreLoopAt = Expander.expandCodeFor(ExitPreLoopAtSCEV, IVTy, InsertPt); ExitPreLoopAt->setName("exit.preloop.at"); } @@ -1364,7 +1494,7 @@ bool LoopConstrainer::run() { if (Increasing) ExitMainLoopAtSCEV = *SR.HighLimit; else { - if (CanBeSMin(SE, *SR.LowLimit)) { + if (CanBeMin(SE, *SR.LowLimit, IsSignedPredicate)) { DEBUG(dbgs() << "irce: could not prove no-overflow when computing " << "mainloop exit limit. LowLimit = " << *(*SR.LowLimit) << "\n"); @@ -1373,6 +1503,13 @@ bool LoopConstrainer::run() { ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS); } + if (!isSafeToExpandAt(ExitMainLoopAtSCEV, InsertPt, SE)) { + DEBUG(dbgs() << "irce: could not prove that it is safe to expand the" + << " main loop exit limit " << *ExitMainLoopAtSCEV + << " at block " << InsertPt->getParent()->getName() << "\n"); + return false; + } + ExitMainLoopAt = Expander.expandCodeFor(ExitMainLoopAtSCEV, IVTy, InsertPt); ExitMainLoopAt->setName("exit.mainloop.at"); } @@ -1463,34 +1600,27 @@ bool LoopConstrainer::run() { /// range, returns None. Optional<InductiveRangeCheck::Range> InductiveRangeCheck::computeSafeIterationSpace( - ScalarEvolution &SE, const SCEVAddRecExpr *IndVar) const { + ScalarEvolution &SE, const SCEVAddRecExpr *IndVar, + bool IsLatchSigned) const { // IndVar is of the form "A + B * I" (where "I" is the canonical induction // variable, that may or may not exist as a real llvm::Value in the loop) and // this inductive range check is a range check on the "C + D * I" ("C" is - // getOffset() and "D" is getScale()). We rewrite the value being range + // getBegin() and "D" is getStep()). We rewrite the value being range // checked to "M + N * IndVar" where "N" = "D * B^(-1)" and "M" = "C - NA". - // Currently we support this only for "B" = "D" = { 1 or -1 }, but the code - // can be generalized as needed. // // The actual inequalities we solve are of the form // // 0 <= M + 1 * IndVar < L given L >= 0 (i.e. N == 1) // - // The inequality is satisfied by -M <= IndVar < (L - M) [^1]. All additions - // and subtractions are twos-complement wrapping and comparisons are signed. - // - // Proof: - // - // If there exists IndVar such that -M <= IndVar < (L - M) then it follows - // that -M <= (-M + L) [== Eq. 1]. Since L >= 0, if (-M + L) sign-overflows - // then (-M + L) < (-M). Hence by [Eq. 1], (-M + L) could not have - // overflown. - // - // This means IndVar = t + (-M) for t in [0, L). Hence (IndVar + M) = t. - // Hence 0 <= (IndVar + M) < L - - // [^1]: Note that the solution does _not_ apply if L < 0; consider values M = - // 127, IndVar = 126 and L = -2 in an i8 world. + // Here L stands for upper limit of the safe iteration space. + // The inequality is satisfied by (0 - M) <= IndVar < (L - M). 
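+  // Worked instance (hedged): for a check on `i + 5` against `len` with
+  // IndVar = i, we get M = 5, so the raw solution is -5 <= i < len - 5
+  // before the clamping described below is applied.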
To avoid + overflows when calculating (0 - M) and (L - M), we limit the calculations + to the borders of the IV's iteration space, depending on whether that space + is signed or unsigned. For example, if IndVar is unsigned, (0 - M) overflows for any M > 0. + If we figured out that "anything greater than (-M) is safe", we strengthen + this to "everything greater than 0 is safe", assuming that values between + -M and 0 just do not exist in the unsigned iteration space, and we don't want + to deal with overflown values. if (!IndVar->isAffine()) return None; @@ -1499,42 +1629,89 @@ InductiveRangeCheck::computeSafeIterationSpace( const SCEVConstant *B = dyn_cast<SCEVConstant>(IndVar->getStepRecurrence(SE)); if (!B) return None; + assert(!B->isZero() && "Recurrence with zero step?"); - const SCEV *C = getOffset(); - const SCEVConstant *D = dyn_cast<SCEVConstant>(getScale()); + const SCEV *C = getBegin(); + const SCEVConstant *D = dyn_cast<SCEVConstant>(getStep()); if (D != B) return None; - ConstantInt *ConstD = D->getValue(); - if (!(ConstD->isMinusOne() || ConstD->isOne())) - return None; + assert(!D->getValue()->isZero() && "Recurrence with zero step?"); + unsigned BitWidth = cast<IntegerType>(IndVar->getType())->getBitWidth(); + const SCEV *SIntMax = SE.getConstant(APInt::getSignedMaxValue(BitWidth)); + // Subtract Y from X so that the result does not cross the border of the IV + // iteration space. Mathematically, it is equivalent to: + // + // ClampedSubtract(X, Y) = min(max(X - Y, INT_MIN), INT_MAX). [1] + // + // In [1], 'X - Y' is a mathematical subtraction (the result is not bounded + // by any bit width). But after we take min/max, the result is + // guaranteed to be within [INT_MIN, INT_MAX]. + // + // In [1], INT_MAX and INT_MIN are the signed or unsigned max/min values, + // depending on the type of latch condition that defines the IV iteration + // space. + auto ClampedSubtract = [&](const SCEV *X, const SCEV *Y) { + assert(SE.isKnownNonNegative(X) && + "We can only subtract from values in [0; SINT_MAX]!"); + if (IsLatchSigned) { + // X is a number from the signed range, Y is interpreted as signed. + // Even if Y is SINT_MAX, (X - Y) does not reach SINT_MIN. So the only + // thing we should care about is that we didn't cross SINT_MAX. + // So, if Y is positive, we subtract Y safely. + // Rule 1: Y > 0 ---> Y. + // If 0 <= -Y <= (SINT_MAX - X), we subtract Y safely. + // Rule 2: Y >=s (X - SINT_MAX) ---> Y. + // If 0 <= (SINT_MAX - X) < -Y, we can only subtract (X - SINT_MAX). + // Rule 3: Y <s (X - SINT_MAX) ---> (X - SINT_MAX). + // It gives us smax(Y, X - SINT_MAX) to subtract in all cases. + const SCEV *XMinusSIntMax = SE.getMinusSCEV(X, SIntMax); + return SE.getMinusSCEV(X, SE.getSMaxExpr(Y, XMinusSIntMax), + SCEV::FlagNSW); + } else + // X is a number from the unsigned range, Y is interpreted as signed. + // Even if Y is SINT_MIN, (X - Y) does not reach UINT_MAX. So the only + // thing we should care about is that we didn't cross zero. + // So, if Y is negative, we subtract Y safely. + // Rule 1: Y <s 0 ---> Y. + // If 0 <= Y <= X, we subtract Y safely. + // Rule 2: Y <=s X ---> Y. + // If 0 <= X < Y, we should stop at 0 and can only subtract X. + // Rule 3: Y >s X ---> X. + // It gives us smin(X, Y) to subtract in all cases.
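+      // Numeric check (hedged): X = 3, Y = 10 in this unsigned case gives
+      // smin(3, 10) = 3, so we subtract only 3 and the result stops at 0
+      // instead of wrapping around UINT_MAX.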
+ return SE.getMinusSCEV(X, SE.getSMinExpr(X, Y), SCEV::FlagNUW); + }; const SCEV *M = SE.getMinusSCEV(C, A); - - const SCEV *Begin = SE.getNegativeSCEV(M); - const SCEV *UpperLimit = nullptr; + const SCEV *Zero = SE.getZero(M->getType()); + const SCEV *Begin = ClampedSubtract(Zero, M); + const SCEV *L = nullptr; // We strengthen "0 <= I" to "0 <= I < INT_SMAX" and "I < L" to "0 <= I < L". // We can potentially do much better here. - if (Value *V = getLength()) { - UpperLimit = SE.getSCEV(V); - } else { + if (const SCEV *EndLimit = getEnd()) + L = EndLimit; + else { assert(Kind == InductiveRangeCheck::RANGE_CHECK_LOWER && "invariant!"); - unsigned BitWidth = cast<IntegerType>(IndVar->getType())->getBitWidth(); - UpperLimit = SE.getConstant(APInt::getSignedMaxValue(BitWidth)); + L = SIntMax; } - - const SCEV *End = SE.getMinusSCEV(UpperLimit, M); + const SCEV *End = ClampedSubtract(L, M); return InductiveRangeCheck::Range(Begin, End); } static Optional<InductiveRangeCheck::Range> -IntersectRange(ScalarEvolution &SE, - const Optional<InductiveRangeCheck::Range> &R1, - const InductiveRangeCheck::Range &R2) { +IntersectSignedRange(ScalarEvolution &SE, + const Optional<InductiveRangeCheck::Range> &R1, + const InductiveRangeCheck::Range &R2) { + if (R2.isEmpty(SE, /* IsSigned */ true)) + return None; if (!R1.hasValue()) return R2; auto &R1Value = R1.getValue(); + // We never return empty ranges from this function, and R1 is supposed to be + // a result of intersection. Thus, R1 is never empty. + assert(!R1Value.isEmpty(SE, /* IsSigned */ true) && + "We should never have empty R1!"); // TODO: we could widen the smaller range and have this work; but for now we // bail out to keep things simple. @@ -1544,7 +1721,40 @@ IntersectRange(ScalarEvolution &SE, const SCEV *NewBegin = SE.getSMaxExpr(R1Value.getBegin(), R2.getBegin()); const SCEV *NewEnd = SE.getSMinExpr(R1Value.getEnd(), R2.getEnd()); - return InductiveRangeCheck::Range(NewBegin, NewEnd); + // If the resulting range is empty, just return None. + auto Ret = InductiveRangeCheck::Range(NewBegin, NewEnd); + if (Ret.isEmpty(SE, /* IsSigned */ true)) + return None; + return Ret; +} + +static Optional<InductiveRangeCheck::Range> +IntersectUnsignedRange(ScalarEvolution &SE, + const Optional<InductiveRangeCheck::Range> &R1, + const InductiveRangeCheck::Range &R2) { + if (R2.isEmpty(SE, /* IsSigned */ false)) + return None; + if (!R1.hasValue()) + return R2; + auto &R1Value = R1.getValue(); + // We never return empty ranges from this function, and R1 is supposed to be + // a result of intersection. Thus, R1 is never empty. + assert(!R1Value.isEmpty(SE, /* IsSigned */ false) && + "We should never have empty R1!"); + + // TODO: we could widen the smaller range and have this work; but for now we + // bail out to keep things simple. + if (R1Value.getType() != R2.getType()) + return None; + + const SCEV *NewBegin = SE.getUMaxExpr(R1Value.getBegin(), R2.getBegin()); + const SCEV *NewEnd = SE.getUMinExpr(R1Value.getEnd(), R2.getEnd()); + + // If the resulting range is empty, just return None.
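+  // E.g. (hedged): intersecting [0, 10) with [5, 20) yields [umax(0, 5),
+  // umin(10, 20)) = [5, 10), while disjoint inputs such as [0, 5) and
+  // [7, 9) produce [7, 5), which isEmpty flags, so None is returned.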
+ auto Ret = InductiveRangeCheck::Range(NewBegin, NewEnd); + if (Ret.isEmpty(SE, /* IsSigned */ false)) + return None; + return Ret; } bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) { @@ -1598,24 +1808,31 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) { return false; } LoopStructure LS = MaybeLoopStructure.getValue(); - bool Increasing = LS.IndVarIncreasing; - const SCEV *MinusOne = - SE.getConstant(LS.IndVarNext->getType(), Increasing ? -1 : 1, true); const SCEVAddRecExpr *IndVar = - cast<SCEVAddRecExpr>(SE.getAddExpr(SE.getSCEV(LS.IndVarNext), MinusOne)); + cast<SCEVAddRecExpr>(SE.getMinusSCEV(SE.getSCEV(LS.IndVarBase), SE.getSCEV(LS.IndVarStep))); Optional<InductiveRangeCheck::Range> SafeIterRange; Instruction *ExprInsertPt = Preheader->getTerminator(); SmallVector<InductiveRangeCheck, 4> RangeChecksToEliminate; + // Basing on the type of latch predicate, we interpret the IV iteration range + // as signed or unsigned range. We use different min/max functions (signed or + // unsigned) when intersecting this range with safe iteration ranges implied + // by range checks. + auto IntersectRange = + LS.IsSignedPredicate ? IntersectSignedRange : IntersectUnsignedRange; IRBuilder<> B(ExprInsertPt); for (InductiveRangeCheck &IRC : RangeChecks) { - auto Result = IRC.computeSafeIterationSpace(SE, IndVar); + auto Result = IRC.computeSafeIterationSpace(SE, IndVar, + LS.IsSignedPredicate); if (Result.hasValue()) { auto MaybeSafeIterRange = IntersectRange(SE, SafeIterRange, Result.getValue()); if (MaybeSafeIterRange.hasValue()) { + assert( + !MaybeSafeIterRange.getValue().isEmpty(SE, LS.IsSignedPredicate) && + "We should never return empty ranges!"); RangeChecksToEliminate.push_back(IRC); SafeIterRange = MaybeSafeIterRange.getValue(); } diff --git a/lib/Transforms/Scalar/InferAddressSpaces.cpp b/lib/Transforms/Scalar/InferAddressSpaces.cpp index 89b28f0aeee6..7d66c0f73821 100644 --- a/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -1,4 +1,4 @@ -//===-- NVPTXInferAddressSpace.cpp - ---------------------*- C++ -*-===// +//===- InferAddressSpace.cpp - --------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -89,26 +89,54 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ValueMapper.h" +#include <cassert> +#include 
<iterator> +#include <limits> +#include <utility> +#include <vector> #define DEBUG_TYPE "infer-address-spaces" using namespace llvm; +static const unsigned UninitializedAddressSpace = + std::numeric_limits<unsigned>::max(); + namespace { -static const unsigned UninitializedAddressSpace = ~0u; using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>; @@ -146,10 +174,9 @@ private: // Changes the flat address expressions in function F to point to specific // address spaces if InferredAddrSpace says so. Postorder is the postorder of // all flat expressions in the use-def graph of function F. - bool - rewriteWithNewAddressSpaces(ArrayRef<WeakTrackingVH> Postorder, - const ValueToAddrSpaceMapTy &InferredAddrSpace, - Function *F) const; + bool rewriteWithNewAddressSpaces( + const TargetTransformInfo &TTI, ArrayRef<WeakTrackingVH> Postorder, + const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const; void appendsFlatAddressExpressionToPostorderStack( Value *V, std::vector<std::pair<Value *, bool>> &PostorderStack, @@ -170,13 +197,16 @@ private: SmallVectorImpl<const Use *> *UndefUsesToFix) const; unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) const; }; + } // end anonymous namespace char InferAddressSpaces::ID = 0; namespace llvm { + void initializeInferAddressSpacesPass(PassRegistry &); -} + +} // end namespace llvm INITIALIZE_PASS(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces", false, false) @@ -454,11 +484,10 @@ static Value *cloneInstructionWithNewAddressSpace( NewGEP->setIsInBounds(GEP->isInBounds()); return NewGEP; } - case Instruction::Select: { + case Instruction::Select: assert(I->getType()->isPointerTy()); return SelectInst::Create(I->getOperand(0), NewPointerOperands[1], NewPointerOperands[2], "", nullptr, I); - } default: llvm_unreachable("Unexpected opcode"); } @@ -600,7 +629,7 @@ bool InferAddressSpaces::runOnFunction(Function &F) { // Changes the address spaces of the flat address expressions who are inferred // to point to a specific address space. - return rewriteWithNewAddressSpaces(Postorder, InferredAddrSpace, &F); + return rewriteWithNewAddressSpaces(TTI, Postorder, InferredAddrSpace, &F); } // Constants need to be tracked through RAUW to handle cases with nested @@ -708,24 +737,32 @@ Optional<unsigned> InferAddressSpaces::updateAddressSpace( /// \p returns true if \p U is the pointer operand of a memory instruction with /// a single pointer operand that can have its address space changed by simply -/// mutating the use to a new value. -static bool isSimplePointerUseValidToReplace(Use &U) { +/// mutating the use to a new value. If the memory instruction is volatile, +/// return true only if the target allows the memory instruction to be volatile +/// in the new address space. 
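+/// For instance (a hedged example, not from this change): if the target
+/// reports via hasVolatileVariant that a volatile load is legal in the
+/// inferred address space, a `load volatile` whose pointer operand is
+/// rewritten into that space can now be accepted instead of rejected.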
+static bool isSimplePointerUseValidToReplace(const TargetTransformInfo &TTI, + Use &U, unsigned AddrSpace) { User *Inst = U.getUser(); unsigned OpNo = U.getOperandNo(); + bool VolatileIsAllowed = false; + if (auto *I = dyn_cast<Instruction>(Inst)) + VolatileIsAllowed = TTI.hasVolatileVariant(I, AddrSpace); if (auto *LI = dyn_cast<LoadInst>(Inst)) - return OpNo == LoadInst::getPointerOperandIndex() && !LI->isVolatile(); + return OpNo == LoadInst::getPointerOperandIndex() && + (VolatileIsAllowed || !LI->isVolatile()); if (auto *SI = dyn_cast<StoreInst>(Inst)) - return OpNo == StoreInst::getPointerOperandIndex() && !SI->isVolatile(); + return OpNo == StoreInst::getPointerOperandIndex() && + (VolatileIsAllowed || !SI->isVolatile()); if (auto *RMW = dyn_cast<AtomicRMWInst>(Inst)) - return OpNo == AtomicRMWInst::getPointerOperandIndex() && !RMW->isVolatile(); + return OpNo == AtomicRMWInst::getPointerOperandIndex() && + (VolatileIsAllowed || !RMW->isVolatile()); - if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) { + if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) return OpNo == AtomicCmpXchgInst::getPointerOperandIndex() && - !CmpX->isVolatile(); - } + (VolatileIsAllowed || !CmpX->isVolatile()); return false; } @@ -818,7 +855,7 @@ static Value::use_iterator skipToNextUser(Value::use_iterator I, } bool InferAddressSpaces::rewriteWithNewAddressSpaces( - ArrayRef<WeakTrackingVH> Postorder, + const TargetTransformInfo &TTI, ArrayRef<WeakTrackingVH> Postorder, const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const { // For each address expression to be modified, creates a clone of it with its // pointer operands converted to the new address space. Since the pointer @@ -878,7 +915,8 @@ bool InferAddressSpaces::rewriteWithNewAddressSpaces( // to the next instruction. I = skipToNextUser(I, E); - if (isSimplePointerUseValidToReplace(U)) { + if (isSimplePointerUseValidToReplace( + TTI, U, V->getType()->getPointerAddressSpace())) { // If V is used as the pointer operand of a compatible memory operation, // sets the pointer operand to NewV. This replacement does not change // the element type, so the resultant load/store is still valid. 
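      // A hedged IR-level illustration (address space numbers invented):
      //   %v = load float, float addrspace(4)* %p    ; flat address space
      // becomes, once %p is inferred to live in address space 3,
      //   %v = load float, float addrspace(3)* %p.new
      // Only the pointer operand is mutated; the loaded type stays float.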
@@ -933,6 +971,11 @@ bool InferAddressSpaces::rewriteWithNewAddressSpaces( if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(CurUser)) { unsigned NewAS = NewV->getType()->getPointerAddressSpace(); if (ASC->getDestAddressSpace() == NewAS) { + if (ASC->getType()->getPointerElementType() != + NewV->getType()->getPointerElementType()) { + NewV = CastInst::Create(Instruction::BitCast, NewV, + ASC->getType(), "", ASC); + } ASC->replaceAllUsesWith(NewV); DeadInstructions.push_back(ASC); continue; diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index dc9143bebc45..6b0377e0ecb3 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -14,25 +14,50 @@ #include "llvm/Transforms/Scalar/JumpThreading.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/ConstantRange.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" +#include "llvm/Support/BlockFrequency.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -41,8 +66,15 @@ #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Transforms/Utils/ValueMapper.h" #include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <iterator> #include <memory> +#include <utility> + using namespace llvm; using namespace jumpthreading; @@ -70,6 +102,7 @@ static cl::opt<bool> PrintLVIAfterJumpThreading( cl::Hidden); namespace { + /// This pass performs 'jump threading', which looks at blocks that have /// multiple predecessors and multiple successors. If one or more of the /// predecessors of the block can be proven to always jump to one of the @@ -85,12 +118,12 @@ namespace { /// /// In this case, the unconditional branch at the end of the first if can be /// revectored to the false side of the second if. 
- /// class JumpThreading : public FunctionPass { JumpThreadingPass Impl; public: static char ID; // Pass identification + JumpThreading(int T = -1) : FunctionPass(ID), Impl(T) { initializeJumpThreadingPass(*PassRegistry::getPassRegistry()); } @@ -108,9 +141,11 @@ namespace { void releaseMemory() override { Impl.releaseMemory(); } }; -} + +} // end anonymous namespace char JumpThreading::ID = 0; + INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading", "Jump Threading", false, false) INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass) @@ -120,14 +155,125 @@ INITIALIZE_PASS_END(JumpThreading, "jump-threading", "Jump Threading", false, false) // Public interface to the Jump Threading pass -FunctionPass *llvm::createJumpThreadingPass(int Threshold) { return new JumpThreading(Threshold); } +FunctionPass *llvm::createJumpThreadingPass(int Threshold) { + return new JumpThreading(Threshold); +} JumpThreadingPass::JumpThreadingPass(int T) { BBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T); } -/// runOnFunction - Top level algorithm. -/// +// Update branch probability information according to conditional +// branch probability. This is usually made possible for cloned branches +// in inline instances by the context-specific profile in the caller. +// For instance, +// +// [Block PredBB] +// [Branch PredBr] +// if (t) { +// Block A; +// } else { +// Block B; +// } +// +// [Block BB] +// cond = PN([true, %A], [..., %B]); // PHI node +// [Branch CondBr] +// if (cond) { +// ... // P(cond == true) = 1% +// } +// +// Here we know that when block A is taken, cond must be true, which means +// P(cond == true | A) = 1 +// +// Given that P(cond == true) = P(cond == true | A) * P(A) + +// P(cond == true | B) * P(B) +// we get: +// P(cond == true) = P(A) + P(cond == true | B) * P(B) +// +// which gives us: +// P(A) is at most P(cond == true), i.e. +// P(t == true) <= P(cond == true) +// +// In other words, if we know P(cond == true) is unlikely, we know +// that P(t == true) is also unlikely. +// +static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) { + BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator()); + if (!CondBr) + return; + + BranchProbability BP; + uint64_t TrueWeight, FalseWeight; + if (!CondBr->extractProfMetadata(TrueWeight, FalseWeight)) + return; + + // Returns the outgoing edge of the dominating predecessor block + // that leads to the PhiNode's incoming block: + auto GetPredOutEdge = + [](BasicBlock *IncomingBB, + BasicBlock *PhiBB) -> std::pair<BasicBlock *, BasicBlock *> { + auto *PredBB = IncomingBB; + auto *SuccBB = PhiBB; + while (true) { + BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator()); + if (PredBr && PredBr->isConditional()) + return {PredBB, SuccBB}; + auto *SinglePredBB = PredBB->getSinglePredecessor(); + if (!SinglePredBB) + return {nullptr, nullptr}; + SuccBB = PredBB; + PredBB = SinglePredBB; + } + }; + + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *PhiOpnd = PN->getIncomingValue(i); + ConstantInt *CI = dyn_cast<ConstantInt>(PhiOpnd); + + if (!CI || !CI->getType()->isIntegerTy(1)) + continue; + + BP = (CI->isOne() ? 
BranchProbability::getBranchProbability( TrueWeight, TrueWeight + FalseWeight) : BranchProbability::getBranchProbability( FalseWeight, TrueWeight + FalseWeight)); + + auto PredOutEdge = GetPredOutEdge(PN->getIncomingBlock(i), BB); + if (!PredOutEdge.first) + return; + + BasicBlock *PredBB = PredOutEdge.first; + BranchInst *PredBr = cast<BranchInst>(PredBB->getTerminator()); + + uint64_t PredTrueWeight, PredFalseWeight; + // FIXME: We currently only set the profile data when it is missing. + // With PGO, this can be used to refine even existing profile data with + // context information. This needs to be done after more performance + // testing. + if (PredBr->extractProfMetadata(PredTrueWeight, PredFalseWeight)) + continue; + + // We cannot infer anything useful when BP >= 50%, because BP is the + // upper bound probability value. + if (BP >= BranchProbability(50, 100)) + continue; + + SmallVector<uint32_t, 2> Weights; + if (PredBr->getSuccessor(0) == PredOutEdge.second) { + Weights.push_back(BP.getNumerator()); + Weights.push_back(BP.getCompl().getNumerator()); + } else { + Weights.push_back(BP.getCompl().getNumerator()); + Weights.push_back(BP.getNumerator()); + } + PredBr->setMetadata(LLVMContext::MD_prof, + MDBuilder(PredBr->getParent()->getContext()) + .createBranchWeights(Weights)); + } +} + +/// runOnFunction - Top-level algorithm. bool JumpThreading::runOnFunction(Function &F) { if (skipFunction(F)) return false; @@ -155,7 +301,6 @@ bool JumpThreading::runOnFunction(Function &F) { PreservedAnalyses JumpThreadingPass::run(Function &F, FunctionAnalysisManager &AM) { - auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); auto &LVI = AM.getResult<LazyValueAnalysis>(F); auto &AA = AM.getResult<AAManager>(F); @@ -184,7 +329,6 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_, bool HasProfileData_, std::unique_ptr<BlockFrequencyInfo> BFI_, std::unique_ptr<BranchProbabilityInfo> BPI_) { - DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); TLI = TLI_; LVI = LVI_; @@ -384,7 +528,6 @@ static unsigned getJumpThreadDuplicationCost(BasicBlock *BB, /// within the loop (forming a nested loop). This simple analysis is not rich /// enough to track all of these properties and keep it up-to-date as the CFG /// mutates, so we don't allow any of these transformations. -/// void JumpThreadingPass::FindLoopHeaders(Function &F) { SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges; FindFunctionBackedges(F, Edges); @@ -418,7 +561,6 @@ static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) { /// BB in the result vector. /// /// This returns true if there were any known values. -/// bool JumpThreadingPass::ComputeValueKnownInPredecessors( Value *V, BasicBlock *BB, PredValueInfo &Result, ConstantPreference Preference, Instruction *CxtI) { @@ -507,8 +649,6 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( return true; } - PredValueInfoTy LHSVals, RHSVals; - // Handle some boolean conditions.
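  // For example (hedged): given `%c = and i1 %a, %b`, a predecessor edge on
  // which %a is known to be false lets us record %c == false for that edge
  // without evaluating %b at all.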
if (I->getType()->getPrimitiveSizeInBits() == 1) { assert(Preference == WantInteger && "One-bit non-integer type?"); @@ -516,6 +656,8 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( // X & false -> false if (I->getOpcode() == Instruction::Or || I->getOpcode() == Instruction::And) { + PredValueInfoTy LHSVals, RHSVals; + ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals, WantInteger, CxtI); ComputeValueKnownInPredecessors(I->getOperand(1), BB, RHSVals, @@ -655,6 +797,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( // x as a live-in. { using namespace PatternMatch; + Value *AddLHS; ConstantInt *AddConst; if (isa<ConstantInt>(CmpConst) && @@ -751,14 +894,11 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( return !Result.empty(); } - - /// GetBestDestForBranchOnUndef - If we determine that the specified block ends /// in an undefined jump, decide which block is best to revector to. /// /// Since we can pick an arbitrary destination, we pick the successor with the /// fewest predecessors. This should reduce the in-degree of the others. -/// static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) { TerminatorInst *BBTerm = BB->getTerminator(); unsigned MinSucc = 0; @@ -979,7 +1119,6 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { // for loads that are used by a switch or by the condition for the branch. If // we see one, check to see if it's partially redundant. If so, insert a PHI // which can then be used to thread the values. - // Value *SimplifyValue = CondInst; if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue)) if (isa<Constant>(CondCmp->getOperand(1))) @@ -991,10 +1130,14 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { if (SimplifyPartiallyRedundantLoad(LI)) return true; + // Before threading, try to propagate profile data backwards: + if (PHINode *PN = dyn_cast<PHINode>(CondInst)) + if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator())) + updatePredecessorProfileMetadata(PN, BB); + // Handle a variety of cases where we are branching on something derived from // a PHI node in the current block. If we can prove that any predecessors // compute a predictable value based on a PHI node, thread those predecessors. - // if (ProcessThreadableEdges(CondInst, BB, Preference, Terminator)) return true; @@ -1036,9 +1179,9 @@ bool JumpThreadingPass::ProcessImpliedCondition(BasicBlock *BB) { if (PBI->getSuccessor(0) != CurrentBB && PBI->getSuccessor(1) != CurrentBB) return false; - bool FalseDest = PBI->getSuccessor(1) == CurrentBB; + bool CondIsTrue = PBI->getSuccessor(0) == CurrentBB; Optional<bool> Implication = - isImpliedCondition(PBI->getCondition(), Cond, DL, FalseDest); + isImpliedCondition(PBI->getCondition(), Cond, DL, CondIsTrue); if (Implication) { BI->getSuccessor(*Implication ? 1 : 0)->removePredecessor(BB); BranchInst::Create(BI->getSuccessor(*Implication ? 0 : 1), BI); @@ -1124,7 +1267,9 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) { LI->getAAMetadata(AATags); SmallPtrSet<BasicBlock*, 8> PredsScanned; - typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy; + + using AvailablePredsTy = SmallVector<std::pair<BasicBlock *, Value *>, 8>; + AvailablePredsTy AvailablePreds; BasicBlock *OneUnavailablePred = nullptr; SmallVector<LoadInst*, 8> CSELoads; @@ -1283,8 +1428,8 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) { /// the list. 
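/// Illustrative example (not from this patch): given PredToDestList =
/// {(P1, D1), (P2, D1), (P3, D2)}, D1 is returned, since threading toward
/// the most popular destination untangles the largest number of
/// predecessors in one step.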
static BasicBlock * FindMostPopularDest(BasicBlock *BB, - const SmallVectorImpl<std::pair<BasicBlock*, - BasicBlock*> > &PredToDestList) { + const SmallVectorImpl<std::pair<BasicBlock *, + BasicBlock *>> &PredToDestList) { assert(!PredToDestList.empty()); // Determine popularity. If there are multiple possible destinations, we @@ -1502,7 +1647,6 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, /// ProcessBranchOnPHI - We have an otherwise unthreadable conditional branch on /// a PHI node in the current block. See if there are any simplifications we /// can do based on inputs to the phi node. -/// bool JumpThreadingPass::ProcessBranchOnPHI(PHINode *PN) { BasicBlock *BB = PN->getParent(); @@ -1532,7 +1676,6 @@ bool JumpThreadingPass::ProcessBranchOnPHI(PHINode *PN) { /// ProcessBranchOnXOR - We have an otherwise unthreadable conditional branch on /// a xor instruction in the current block. See if there are any /// simplifications we can do based on inputs to the xor. -/// bool JumpThreadingPass::ProcessBranchOnXOR(BinaryOperator *BO) { BasicBlock *BB = BO->getParent(); @@ -1637,7 +1780,6 @@ bool JumpThreadingPass::ProcessBranchOnXOR(BinaryOperator *BO) { return DuplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto); } - /// AddPHINodeEntriesForMappedBlock - We're adding 'NewPred' as a new /// predecessor to the PHIBB block. If it has PHI nodes, add entries for /// NewPred using the entries from OldPred (suitably mapped). @@ -1677,10 +1819,15 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, // If threading this would thread across a loop header, don't thread the edge. // See the comments above FindLoopHeaders for justifications and caveats. - if (LoopHeaders.count(BB)) { - DEBUG(dbgs() << " Not threading across loop header BB '" << BB->getName() - << "' to dest BB '" << SuccBB->getName() - << "' - it might create an irreducible loop!\n"); + if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) { + DEBUG({ + bool BBIsHeader = LoopHeaders.count(BB); + bool SuccIsHeader = LoopHeaders.count(SuccBB); + dbgs() << " Not threading across " + << (BBIsHeader ? "loop header BB '" : "block BB '") << BB->getName() + << "' to dest " << (SuccIsHeader ? "loop header BB '" : "block BB '") + << SuccBB->getName() << "' - it might create an irreducible loop!\n"; + }); return false; } @@ -1795,7 +1942,6 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, DEBUG(dbgs() << "\n"); } - // Ok, NewBB is good to go. Update the terminator of PredBB to jump to // NewBB instead of BB. This eliminates predecessors from BB, which requires // us to simplify any PHI nodes in BB. @@ -2194,7 +2340,7 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) { /// %p = phi [0, %bb1], [1, %bb2], [0, %bb3], [1, %bb4], ... /// %c = cmp %p, 0 /// %s = select %c, trueval, falseval -// +/// /// And expand the select into a branch structure. This later enables /// jump-threading over bb in this pass. /// @@ -2280,6 +2426,7 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) { /// guard is then threaded to one of them. bool JumpThreadingPass::ProcessGuards(BasicBlock *BB) { using namespace PatternMatch; + // We only want to deal with two predecessors. BasicBlock *Pred1, *Pred2; auto PI = pred_begin(BB), PE = pred_end(BB); @@ -2331,8 +2478,7 @@ bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard, TrueDestIsSafe = true; else { // False dest is safe if !BranchCond => GuardCond. 
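      // Illustrative example (not from this patch): with BranchCond (x == 0)
      // and GuardCond (x != 0), knowing BranchCond is false on this edge
      // yields exactly x != 0, so the implication holds and the guard can be
      // threaded to the false destination.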
- Impl = - isImpliedCondition(BranchCond, GuardCond, DL, /* InvertAPred */ true); + Impl = isImpliedCondition(BranchCond, GuardCond, DL, /* LHSIsTrue */ false); if (Impl && *Impl) FalseDestIsSafe = true; } diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 37b9c4b1094e..4ea935793b80 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -42,7 +42,8 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/MemoryBuiltins.h" -#include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -62,6 +63,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -88,15 +90,15 @@ static cl::opt<uint32_t> MaxNumUsesTraversed( "invariance in loop using invariant start (default = 8)")); static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI); -static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop, - const LoopSafetyInfo *SafetyInfo); +static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop, + const LoopSafetyInfo *SafetyInfo, + TargetTransformInfo *TTI, bool &FreeInLoop); static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE); -static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT, - const Loop *CurLoop, AliasSetTracker *CurAST, - const LoopSafetyInfo *SafetyInfo, - OptimizationRemarkEmitter *ORE); +static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, + const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE, bool FreeInLoop); static bool isSafeToExecuteUnconditionally(Instruction &Inst, const DominatorTree *DT, const Loop *CurLoop, @@ -114,7 +116,8 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN, namespace { struct LoopInvariantCodeMotion { bool runOnLoop(Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT, - TargetLibraryInfo *TLI, ScalarEvolution *SE, + TargetLibraryInfo *TLI, TargetTransformInfo *TTI, + ScalarEvolution *SE, MemorySSA *MSSA, OptimizationRemarkEmitter *ORE, bool DeleteAST); DenseMap<Loop *, AliasSetTracker *> &getLoopToAliasSetMap() { @@ -146,6 +149,9 @@ struct LegacyLICMPass : public LoopPass { } auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); + MemorySSA *MSSA = EnableMSSALoopDependency + ? (&getAnalysis<MemorySSAWrapperPass>().getMSSA()) + : nullptr; // For the old PM, we can't use OptimizationRemarkEmitter as an analysis // pass. Function analyses need to be preserved across loop transformations // but ORE cannot be preserved (see comment before the pass definition). @@ -155,7 +161,9 @@ struct LegacyLICMPass : public LoopPass { &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), - SE ? &SE->getSE() : nullptr, &ORE, false); + &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *L->getHeader()->getParent()), + SE ? 
&SE->getSE() : nullptr, MSSA, &ORE, false); } /// This transformation requires natural loop information & requires that @@ -164,6 +172,9 @@ struct LegacyLICMPass : public LoopPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<TargetLibraryInfoWrapperPass>(); + if (EnableMSSALoopDependency) + AU.addRequired<MemorySSAWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); getLoopAnalysisUsage(AU); } @@ -189,7 +200,7 @@ private: /// Simple Analysis hook. Delete loop L from alias set map. void deleteAnalysisLoop(Loop *L) override; }; -} +} // namespace PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &) { @@ -204,7 +215,8 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, "cached at a higher level"); LoopInvariantCodeMotion LICM; - if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.SE, ORE, true)) + if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.TTI, &AR.SE, + AR.MSSA, ORE, true)) return PreservedAnalyses::all(); auto PA = getLoopPassPreservedAnalyses(); @@ -217,6 +229,8 @@ INITIALIZE_PASS_BEGIN(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false, false) INITIALIZE_PASS_DEPENDENCY(LoopPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) INITIALIZE_PASS_END(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false, false) @@ -228,12 +242,10 @@ Pass *llvm::createLICMPass() { return new LegacyLICMPass(); } /// We should delete AST for inner loops in the new pass manager to avoid /// memory leak. /// -bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA, - LoopInfo *LI, DominatorTree *DT, - TargetLibraryInfo *TLI, - ScalarEvolution *SE, - OptimizationRemarkEmitter *ORE, - bool DeleteAST) { +bool LoopInvariantCodeMotion::runOnLoop( + Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT, + TargetLibraryInfo *TLI, TargetTransformInfo *TTI, ScalarEvolution *SE, + MemorySSA *MSSA, OptimizationRemarkEmitter *ORE, bool DeleteAST) { bool Changed = false; assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form."); @@ -258,7 +270,7 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA, // instructions, we perform another pass to hoist them out of the loop. // if (L->hasDedicatedExits()) - Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L, + Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L, CurAST, &SafetyInfo, ORE); if (Preheader) Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L, @@ -292,10 +304,26 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA, bool Promoted = false; // Loop over all of the alias sets in the tracker object. - for (AliasSet &AS : *CurAST) - Promoted |= - promoteLoopAccessesToScalars(AS, ExitBlocks, InsertPts, PIC, LI, DT, - TLI, L, CurAST, &SafetyInfo, ORE); + for (AliasSet &AS : *CurAST) { + // We can promote this alias set if it has a store, if it is a "Must" + // alias set, if the pointer is loop invariant, and if we are not + // eliminating any volatile loads or stores. 
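    // Illustrative example (not from this patch): in
    //   for (i = 0; i != n; ++i) *p += a[i];
    // the alias set of p is stored to, must-alias, loop invariant, and
    // non-volatile, so the checks below admit it and *p can be kept in a
    // register for the whole loop.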
+ if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() || + AS.isVolatile() || !L->isLoopInvariant(AS.begin()->getValue())) + continue; + + assert( + !AS.empty() && + "Must alias set should have at least one pointer element in it!"); + + SmallSetVector<Value *, 8> PointerMustAliases; + for (const auto &ASI : AS) + PointerMustAliases.insert(ASI.getValue()); + + Promoted |= promoteLoopAccessesToScalars(PointerMustAliases, ExitBlocks, + InsertPts, PIC, LI, DT, TLI, L, + CurAST, &SafetyInfo, ORE); + } // Once we have promoted values across the loop body we have to // recursively reform LCSSA as any nested loop may now have values defined @@ -335,7 +363,8 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA, /// definitions, allowing us to sink a loop body in one pass without iteration. /// bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, - DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop, + DominatorTree *DT, TargetLibraryInfo *TLI, + TargetTransformInfo *TTI, Loop *CurLoop, AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE) { @@ -344,46 +373,50 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, CurLoop != nullptr && CurAST != nullptr && SafetyInfo != nullptr && "Unexpected input to sinkRegion"); - BasicBlock *BB = N->getBlock(); - // If this subregion is not in the top level loop at all, exit. - if (!CurLoop->contains(BB)) - return false; + // We want to visit children before parents. We will enque all the parents + // before their children in the worklist and process the worklist in reverse + // order. + SmallVector<DomTreeNode *, 16> Worklist = collectChildrenInLoop(N, CurLoop); - // We are processing blocks in reverse dfo, so process children first. bool Changed = false; - const std::vector<DomTreeNode *> &Children = N->getChildren(); - for (DomTreeNode *Child : Children) - Changed |= - sinkRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo, ORE); - - // Only need to process the contents of this block if it is not part of a - // subloop (which would already have been processed). - if (inSubLoop(BB, CurLoop, LI)) - return Changed; + for (DomTreeNode *DTN : reverse(Worklist)) { + BasicBlock *BB = DTN->getBlock(); + // Only need to process the contents of this block if it is not part of a + // subloop (which would already have been processed). + if (inSubLoop(BB, CurLoop, LI)) + continue; - for (BasicBlock::iterator II = BB->end(); II != BB->begin();) { - Instruction &I = *--II; + for (BasicBlock::iterator II = BB->end(); II != BB->begin();) { + Instruction &I = *--II; - // If the instruction is dead, we would try to sink it because it isn't used - // in the loop, instead, just delete it. - if (isInstructionTriviallyDead(&I, TLI)) { - DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n'); - ++II; - CurAST->deleteValue(&I); - I.eraseFromParent(); - Changed = true; - continue; - } + // If the instruction is dead, we would try to sink it because it isn't + // used in the loop, instead, just delete it. + if (isInstructionTriviallyDead(&I, TLI)) { + DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n'); + ++II; + CurAST->deleteValue(&I); + I.eraseFromParent(); + Changed = true; + continue; + } - // Check to see if we can sink this instruction to the exit blocks - // of the loop. We can do this if the all users of the instruction are - // outside of the loop. In this case, it doesn't even matter if the - // operands of the instruction are loop invariant. 
- // - if (isNotUsedInLoop(I, CurLoop, SafetyInfo) && - canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE)) { - ++II; - Changed |= sink(I, LI, DT, CurLoop, CurAST, SafetyInfo, ORE); + // Check to see if we can sink this instruction to the exit blocks + // of the loop. We can do this if the all users of the instruction are + // outside of the loop. In this case, it doesn't even matter if the + // operands of the instruction are loop invariant. + // + bool FreeInLoop = false; + if (isNotUsedOrFreeInLoop(I, CurLoop, SafetyInfo, TTI, FreeInLoop) && + canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE)) { + if (sink(I, LI, DT, CurLoop, SafetyInfo, ORE, FreeInLoop)) { + if (!FreeInLoop) { + ++II; + CurAST->deleteValue(&I); + I.eraseFromParent(); + } + Changed = true; + } + } } } return Changed; @@ -403,73 +436,70 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, CurLoop != nullptr && CurAST != nullptr && SafetyInfo != nullptr && "Unexpected input to hoistRegion"); - BasicBlock *BB = N->getBlock(); - - // If this subregion is not in the top level loop at all, exit. - if (!CurLoop->contains(BB)) - return false; + // We want to visit parents before children. We will enque all the parents + // before their children in the worklist and process the worklist in order. + SmallVector<DomTreeNode *, 16> Worklist = collectChildrenInLoop(N, CurLoop); - // Only need to process the contents of this block if it is not part of a - // subloop (which would already have been processed). bool Changed = false; - if (!inSubLoop(BB, CurLoop, LI)) - for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) { - Instruction &I = *II++; - // Try constant folding this instruction. If all the operands are - // constants, it is technically hoistable, but it would be better to just - // fold it. - if (Constant *C = ConstantFoldInstruction( - &I, I.getModule()->getDataLayout(), TLI)) { - DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n'); - CurAST->copyValue(&I, C); - I.replaceAllUsesWith(C); - if (isInstructionTriviallyDead(&I, TLI)) { - CurAST->deleteValue(&I); - I.eraseFromParent(); + for (DomTreeNode *DTN : Worklist) { + BasicBlock *BB = DTN->getBlock(); + // Only need to process the contents of this block if it is not part of a + // subloop (which would already have been processed). + if (!inSubLoop(BB, CurLoop, LI)) + for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) { + Instruction &I = *II++; + // Try constant folding this instruction. If all the operands are + // constants, it is technically hoistable, but it would be better to + // just fold it. + if (Constant *C = ConstantFoldInstruction( + &I, I.getModule()->getDataLayout(), TLI)) { + DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n'); + CurAST->copyValue(&I, C); + I.replaceAllUsesWith(C); + if (isInstructionTriviallyDead(&I, TLI)) { + CurAST->deleteValue(&I); + I.eraseFromParent(); + } + Changed = true; + continue; } - Changed = true; - continue; - } - // Attempt to remove floating point division out of the loop by converting - // it to a reciprocal multiplication. 
- if (I.getOpcode() == Instruction::FDiv && - CurLoop->isLoopInvariant(I.getOperand(1)) && - I.hasAllowReciprocal()) { - auto Divisor = I.getOperand(1); - auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0); - auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor); - ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags()); - ReciprocalDivisor->insertBefore(&I); + // Attempt to remove floating point division out of the loop by + // converting it to a reciprocal multiplication. + if (I.getOpcode() == Instruction::FDiv && + CurLoop->isLoopInvariant(I.getOperand(1)) && + I.hasAllowReciprocal()) { + auto Divisor = I.getOperand(1); + auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0); + auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor); + ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags()); + ReciprocalDivisor->insertBefore(&I); - auto Product = BinaryOperator::CreateFMul(I.getOperand(0), - ReciprocalDivisor); - Product->setFastMathFlags(I.getFastMathFlags()); - Product->insertAfter(&I); - I.replaceAllUsesWith(Product); - I.eraseFromParent(); + auto Product = + BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor); + Product->setFastMathFlags(I.getFastMathFlags()); + Product->insertAfter(&I); + I.replaceAllUsesWith(Product); + I.eraseFromParent(); - hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE); - Changed = true; - continue; - } + hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE); + Changed = true; + continue; + } - // Try hoisting the instruction out to the preheader. We can only do this - // if all of the operands of the instruction are loop invariant and if it - // is safe to hoist the instruction. - // - if (CurLoop->hasLoopInvariantOperands(&I) && - canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE) && - isSafeToExecuteUnconditionally( - I, DT, CurLoop, SafetyInfo, ORE, - CurLoop->getLoopPreheader()->getTerminator())) - Changed |= hoist(I, DT, CurLoop, SafetyInfo, ORE); - } + // Try hoisting the instruction out to the preheader. We can only do + // this if all of the operands of the instruction are loop invariant and + // if it is safe to hoist the instruction. + // + if (CurLoop->hasLoopInvariantOperands(&I) && + canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE) && + isSafeToExecuteUnconditionally( + I, DT, CurLoop, SafetyInfo, ORE, + CurLoop->getLoopPreheader()->getTerminator())) + Changed |= hoist(I, DT, CurLoop, SafetyInfo, ORE); + } + } - const std::vector<DomTreeNode *> &Children = N->getChildren(); - for (DomTreeNode *Child : Children) - Changed |= - hoistRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo, ORE); return Changed; } @@ -492,7 +522,8 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) { // Iterate over loop instructions and compute safety info. // Skip header as it has been computed and stored in HeaderMayThrow. // The first block in loopinfo.Blocks is guaranteed to be the header. - assert(Header == *CurLoop->getBlocks().begin() && "First block must be header"); + assert(Header == *CurLoop->getBlocks().begin() && + "First block must be header"); for (Loop::block_iterator BB = std::next(CurLoop->block_begin()), BBE = CurLoop->block_end(); (BB != BBE) && !SafetyInfo->MayThrow; ++BB) @@ -510,9 +541,9 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) { } // Return true if LI is invariant within scope of the loop. 
LI is invariant if -// CurLoop is dominated by an invariant.start representing the same memory location -// and size as the memory location LI loads from, and also the invariant.start -// has no uses. +// CurLoop is dominated by an invariant.start representing the same memory +// location and size as the memory location LI loads from, and also the +// invariant.start has no uses. static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT, Loop *CurLoop) { Value *Addr = LI->getOperand(0); @@ -566,10 +597,13 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, Loop *CurLoop, AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE) { + // SafetyInfo is nullptr if we are checking for sinking from preheader to + // loop body. + const bool SinkingToLoopBody = !SafetyInfo; // Loads have extra constraints we have to verify before we can hoist them. if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { if (!LI->isUnordered()) - return false; // Don't hoist volatile/atomic loads! + return false; // Don't sink/hoist volatile or ordered atomic loads! // Loads from constant memory are always safe to move, even if they end up // in the same alias set as something that ends up being modified. @@ -578,6 +612,9 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, if (LI->getMetadata(LLVMContext::MD_invariant_load)) return true; + if (LI->isAtomic() && SinkingToLoopBody) + return false; // Don't sink unordered atomic loads to loop body. + // This checks for an invariant.start dominating the load. if (isLoadInvariantInLoop(LI, DT, CurLoop)) return true; @@ -595,10 +632,12 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, // Check loop-invariant address because this may also be a sinkable load // whose address is not necessarily loop-invariant. if (ORE && Invalidated && CurLoop->isLoopInvariant(LI->getPointerOperand())) - ORE->emit(OptimizationRemarkMissed( - DEBUG_TYPE, "LoadWithLoopInvariantAddressInvalidated", LI) - << "failed to move load with loop-invariant address " - "because the loop may invalidate its value"); + ORE->emit([&]() { + return OptimizationRemarkMissed( + DEBUG_TYPE, "LoadWithLoopInvariantAddressInvalidated", LI) + << "failed to move load with loop-invariant address " + "because the loop may invalidate its value"; + }); return !Invalidated; } else if (CallInst *CI = dyn_cast<CallInst>(&I)) { @@ -653,9 +692,9 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, !isa<InsertValueInst>(I)) return false; - // SafetyInfo is nullptr if we are checking for sinking from preheader to - // loop body. It will be always safe as there is no speculative execution. - if (!SafetyInfo) + // If we are checking for sinking from preheader to loop body it will be + // always safe as there is no speculative execution. + if (SinkingToLoopBody) return true; // TODO: Plumb the context instruction through to make hoisting and sinking @@ -677,13 +716,40 @@ static bool isTriviallyReplacablePHI(const PHINode &PN, const Instruction &I) { return true; } +/// Return true if the instruction is free in the loop. 
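/// Illustrative example (not from this patch): a GEP whose TTI cost is
/// TCC_Free and whose only in-loop users are loads and stores in its own
/// block will fold into their addressing modes, so duplicating it costs
/// nothing even if one copy stays behind in the loop.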
+static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop,
+                         const TargetTransformInfo *TTI) {
+
+  if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+    if (TTI->getUserCost(GEP) != TargetTransformInfo::TCC_Free)
+      return false;
+    // For a GEP, we cannot simply use getUserCost because currently it
+    // optimistically assumes that a GEP will fold into addressing mode
+    // regardless of its users.
+    const BasicBlock *BB = GEP->getParent();
+    for (const User *U : GEP->users()) {
+      const Instruction *UI = cast<Instruction>(U);
+      if (CurLoop->contains(UI) &&
+          (BB != UI->getParent() ||
+           (!isa<StoreInst>(UI) && !isa<LoadInst>(UI))))
+        return false;
+    }
+    return true;
+  } else
+    return TTI->getUserCost(&I) == TargetTransformInfo::TCC_Free;
+}
+
 /// Return true if the only users of this instruction are outside of
 /// the loop. If this is true, we can sink the instruction to the exit
 /// blocks of the loop.
 ///
-static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
-                            const LoopSafetyInfo *SafetyInfo) {
+/// We also return true if the instruction could be folded away in lowering.
+/// (e.g., a GEP can be folded into a load as an addressing mode in the loop).
+static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
+                                  const LoopSafetyInfo *SafetyInfo,
+                                  TargetTransformInfo *TTI, bool &FreeInLoop) {
   const auto &BlockColors = SafetyInfo->BlockColors;
+  bool IsFree = isFreeInLoop(I, CurLoop, TTI);
   for (const User *U : I.users()) {
     const Instruction *UI = cast<Instruction>(U);
     if (const PHINode *PN = dyn_cast<PHINode>(UI)) {
@@ -698,30 +764,15 @@ static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
       if (!BlockColors.empty() &&
           BlockColors.find(const_cast<BasicBlock *>(BB))->second.size() != 1)
         return false;
-
-      // A PHI node where all of the incoming values are this instruction are
-      // special -- they can just be RAUW'ed with the instruction and thus
-      // don't require a use in the predecessor. This is a particular important
-      // special case because it is the pattern found in LCSSA form.
-      if (isTriviallyReplacablePHI(*PN, I)) {
-        if (CurLoop->contains(PN))
-          return false;
-        else
-          continue;
-      }
-
-      // Otherwise, PHI node uses occur in predecessor blocks if the incoming
-      // values. Check for such a use being inside the loop.
-    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
-      if (PN->getIncomingValue(i) == &I)
-        if (CurLoop->contains(PN->getIncomingBlock(i)))
-          return false;
-
-      continue;
     }

-    if (CurLoop->contains(UI))
+    if (CurLoop->contains(UI)) {
+      if (IsFree) {
+        FreeInLoop = true;
+        continue;
+      }
       return false;
+    }
   }
   return true;
 }
@@ -787,77 +838,189 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
   return New;
 }

+static Instruction *sinkThroughTriviallyReplacablePHI(
+    PHINode *TPN, Instruction *I, LoopInfo *LI,
+    SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,
+    const LoopSafetyInfo *SafetyInfo, const Loop *CurLoop) {
+  assert(isTriviallyReplacablePHI(*TPN, *I) &&
+         "Expect only trivially replacable PHI");
+  BasicBlock *ExitBlock = TPN->getParent();
+  Instruction *New;
+  auto It = SunkCopies.find(ExitBlock);
+  if (It != SunkCopies.end())
+    New = It->second;
+  else
+    New = SunkCopies[ExitBlock] =
+        CloneInstructionInExitBlock(*I, *ExitBlock, *TPN, LI, SafetyInfo);
+  return New;
+}
+
+static bool canSplitPredecessors(PHINode *PN) {
+  BasicBlock *BB = PN->getParent();
+  if (!BB->canSplitPredecessors())
+    return false;
+  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+    BasicBlock *BBPred = *PI;
+    if (isa<IndirectBrInst>(BBPred->getTerminator()))
+      return false;
+  }
+  return true;
+}
+
+static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
+                                        LoopInfo *LI, const Loop *CurLoop) {
+#ifndef NDEBUG
+  SmallVector<BasicBlock *, 32> ExitBlocks;
+  CurLoop->getUniqueExitBlocks(ExitBlocks);
+  SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+                                             ExitBlocks.end());
+#endif
+  BasicBlock *ExitBB = PN->getParent();
+  assert(ExitBlockSet.count(ExitBB) && "Expect the PHI is in an exit block.");
+
+  // Split predecessors of the loop exit so that instructions in the loop are
+  // exposed to exit blocks through trivially replacable PHIs while keeping the
+  // loop in the canonical form where each predecessor of each exit block is
+  // contained within the loop. For example, this will convert the loop below
+  // from
+  //
+  // LB1:
+  //   %v1 =
+  //   br %LE, %LB2
+  // LB2:
+  //   %v2 =
+  //   br %LE, %LB1
+  // LE:
+  //   %p = phi [%v1, %LB1], [%v2, %LB2] <-- non-trivially replacable
+  //
+  // to
+  //
+  // LB1:
+  //   %v1 =
+  //   br %LE.split, %LB2
+  // LB2:
+  //   %v2 =
+  //   br %LE.split2, %LB1
+  // LE.split:
+  //   %p1 = phi [%v1, %LB1] <-- trivially replacable
+  //   br %LE
+  // LE.split2:
+  //   %p2 = phi [%v2, %LB2] <-- trivially replacable
+  //   br %LE
+  // LE:
+  //   %p = phi [%p1, %LE.split], [%p2, %LE.split2]
+  //
+  SmallSetVector<BasicBlock *, 8> PredBBs(pred_begin(ExitBB), pred_end(ExitBB));
+  while (!PredBBs.empty()) {
+    BasicBlock *PredBB = *PredBBs.begin();
+    assert(CurLoop->contains(PredBB) &&
+           "Expect all predecessors are in the loop");
+    if (PN->getBasicBlockIndex(PredBB) >= 0)
+      SplitBlockPredecessors(ExitBB, PredBB, ".split.loop.exit", DT, LI, true);
+    PredBBs.remove(PredBB);
+  }
+}
+
 /// When an instruction is found to only be used outside of the loop, this
 /// function moves it to the exit blocks and patches up SSA form as needed.
 /// This method is guaranteed to remove the original instruction from its
 /// position, and may either delete it or move it to outside of the loop.
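/// Illustrative example (not from this patch): a value whose only user is
/// an LCSSA PHI in an exit block is cloned into that exit block and the PHI
/// is RAUW'ed with the clone; a PHI that is not trivially replacable is
/// first made so by splitting the relevant exit predecessors, as in the
/// LB1/LB2/LE diagram above.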
 ///
-static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
-                 const Loop *CurLoop, AliasSetTracker *CurAST,
-                 const LoopSafetyInfo *SafetyInfo,
-                 OptimizationRemarkEmitter *ORE) {
+static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
+                 const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo,
+                 OptimizationRemarkEmitter *ORE, bool FreeInLoop) {
   DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
-  ORE->emit(OptimizationRemark(DEBUG_TYPE, "InstSunk", &I)
-            << "sinking " << ore::NV("Inst", &I));
+  ORE->emit([&]() {
+    return OptimizationRemark(DEBUG_TYPE, "InstSunk", &I)
+           << "sinking " << ore::NV("Inst", &I);
+  });
   bool Changed = false;
   if (isa<LoadInst>(I))
     ++NumMovedLoads;
   else if (isa<CallInst>(I))
     ++NumMovedCalls;
   ++NumSunk;
-  Changed = true;

-#ifndef NDEBUG
-  SmallVector<BasicBlock *, 32> ExitBlocks;
-  CurLoop->getUniqueExitBlocks(ExitBlocks);
-  SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
-                                             ExitBlocks.end());
-#endif
+  // Iterate over users to be ready for actual sinking. Replace users in
+  // unreachable blocks with undef and make all user PHIs trivially replacable.
+  SmallPtrSet<Instruction *, 8> VisitedUsers;
+  for (Value::user_iterator UI = I.user_begin(), UE = I.user_end(); UI != UE;) {
+    auto *User = cast<Instruction>(*UI);
+    Use &U = UI.getUse();
+    ++UI;

-  // Clones of this instruction. Don't create more than one per exit block!
-  SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies;
+    if (VisitedUsers.count(User) || CurLoop->contains(User))
+      continue;

-  // If this instruction is only used outside of the loop, then all users are
-  // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
-  // the instruction.
-  while (!I.use_empty()) {
-    Value::user_iterator UI = I.user_begin();
-    auto *User = cast<Instruction>(*UI);
     if (!DT->isReachableFromEntry(User->getParent())) {
-      User->replaceUsesOfWith(&I, UndefValue::get(I.getType()));
+      U = UndefValue::get(I.getType());
+      Changed = true;
       continue;
     }
+
     // The user must be a PHI node.
     PHINode *PN = cast<PHINode>(User);

     // Surprisingly, instructions can be used outside of loops without any
     // exits. This can only happen in PHI nodes if the incoming block is
     // unreachable.
-    Use &U = UI.getUse();
     BasicBlock *BB = PN->getIncomingBlock(U);
     if (!DT->isReachableFromEntry(BB)) {
       U = UndefValue::get(I.getType());
+      Changed = true;
       continue;
     }

-    BasicBlock *ExitBlock = PN->getParent();
-    assert(ExitBlockSet.count(ExitBlock) &&
-           "The LCSSA PHI is not in an exit block!");
+    VisitedUsers.insert(PN);
+    if (isTriviallyReplacablePHI(*PN, I))
+      continue;

-    Instruction *New;
-    auto It = SunkCopies.find(ExitBlock);
-    if (It != SunkCopies.end())
-      New = It->second;
-    else
-      New = SunkCopies[ExitBlock] =
-          CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI, SafetyInfo);
+    if (!canSplitPredecessors(PN))
+      return Changed;
+
+    // Split predecessors of the PHI so that we can make users trivially
+    // replacable.
+    splitPredecessorsOfLoopExit(PN, DT, LI, CurLoop);
+
+    // Should rebuild the iterators, as they may be invalidated by
+    // splitPredecessorsOfLoopExit().
+    UI = I.user_begin();
+    UE = I.user_end();
+  }
+
+  if (VisitedUsers.empty())
+    return Changed;
+
+#ifndef NDEBUG
+  SmallVector<BasicBlock *, 32> ExitBlocks;
+  CurLoop->getUniqueExitBlocks(ExitBlocks);
+  SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+                                             ExitBlocks.end());
+#endif
+
+  // Clones of this instruction. Don't create more than one per exit block!
+ SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies; + + // If this instruction is only used outside of the loop, then all users are + // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of + // the instruction. + SmallSetVector<User*, 8> Users(I.user_begin(), I.user_end()); + for (auto *UI : Users) { + auto *User = cast<Instruction>(UI); + + if (CurLoop->contains(User)) + continue; + PHINode *PN = cast<PHINode>(User); + assert(ExitBlockSet.count(PN->getParent()) && + "The LCSSA PHI is not in an exit block!"); + // The PHI must be trivially replacable. + Instruction *New = sinkThroughTriviallyReplacablePHI(PN, &I, LI, SunkCopies, + SafetyInfo, CurLoop); PN->replaceAllUsesWith(New); PN->eraseFromParent(); + Changed = true; } - - CurAST->deleteValue(&I); - I.eraseFromParent(); return Changed; } @@ -870,8 +1033,10 @@ static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, auto *Preheader = CurLoop->getLoopPreheader(); DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I << "\n"); - ORE->emit(OptimizationRemark(DEBUG_TYPE, "Hoisted", &I) - << "hoisting " << ore::NV("Inst", &I)); + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "Hoisted", &I) << "hoisting " + << ore::NV("Inst", &I); + }); // Metadata can be dependent on conditions we are hoisting above. // Conservatively strip all metadata on the instruction unless we were @@ -921,10 +1086,12 @@ static bool isSafeToExecuteUnconditionally(Instruction &Inst, if (!GuaranteedToExecute) { auto *LI = dyn_cast<LoadInst>(&Inst); if (LI && CurLoop->isLoopInvariant(LI->getPointerOperand())) - ORE->emit(OptimizationRemarkMissed( - DEBUG_TYPE, "LoadWithLoopInvariantAddressCondExecuted", LI) - << "failed to hoist load with loop-invariant address " - "because load is conditionally executed"); + ORE->emit([&]() { + return OptimizationRemarkMissed( + DEBUG_TYPE, "LoadWithLoopInvariantAddressCondExecuted", LI) + << "failed to hoist load with loop-invariant address " + "because load is conditionally executed"; + }); } return GuaranteedToExecute; @@ -933,7 +1100,7 @@ static bool isSafeToExecuteUnconditionally(Instruction &Inst, namespace { class LoopPromoter : public LoadAndStorePromoter { Value *SomePtr; // Designated pointer to store to. 
- SmallPtrSetImpl<Value *> &PointerMustAliases; + const SmallSetVector<Value *, 8> &PointerMustAliases; SmallVectorImpl<BasicBlock *> &LoopExitBlocks; SmallVectorImpl<Instruction *> &LoopInsertPts; PredIteratorCache &PredCache; @@ -961,7 +1128,7 @@ class LoopPromoter : public LoadAndStorePromoter { public: LoopPromoter(Value *SP, ArrayRef<const Instruction *> Insts, SSAUpdater &S, - SmallPtrSetImpl<Value *> &PMA, + const SmallSetVector<Value *, 8> &PMA, SmallVectorImpl<BasicBlock *> &LEB, SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC, AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment, @@ -969,7 +1136,7 @@ public: : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA), LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast), LI(li), DL(std::move(dl)), Alignment(alignment), - UnorderedAtomic(UnorderedAtomic),AATags(AATags) {} + UnorderedAtomic(UnorderedAtomic), AATags(AATags) {} bool isInstInList(Instruction *I, const SmallVectorImpl<Instruction *> &) const override { @@ -1008,7 +1175,31 @@ public: } void instructionDeleted(Instruction *I) const override { AST.deleteValue(I); } }; -} // end anon namespace + + +/// Return true iff we can prove that a caller of this function can not inspect +/// the contents of the provided object in a well defined program. +bool isKnownNonEscaping(Value *Object, const TargetLibraryInfo *TLI) { + if (isa<AllocaInst>(Object)) + // Since the alloca goes out of scope, we know the caller can't retain a + // reference to it and be well defined. Thus, we don't need to check for + // capture. + return true; + + // For all other objects we need to know that the caller can't possibly + // have gotten a reference to the object. There are two components of + // that: + // 1) Object can't be escaped by this function. This is what + // PointerMayBeCaptured checks. + // 2) Object can't have been captured at definition site. For this, we + // need to know the return value is noalias. At the moment, we use a + // weaker condition and handle only AllocLikeFunctions (which are + // known to be noalias). TODO + return isAllocLikeFn(Object, TLI) && + !PointerMayBeCaptured(Object, true, true); +} + +} // namespace /// Try to promote memory values to scalars by sinking stores out of the /// loop and moving loads to before the loop. We do this by looping over @@ -1016,7 +1207,8 @@ public: /// loop invariant. /// bool llvm::promoteLoopAccessesToScalars( - AliasSet &AS, SmallVectorImpl<BasicBlock *> &ExitBlocks, + const SmallSetVector<Value *, 8> &PointerMustAliases, + SmallVectorImpl<BasicBlock *> &ExitBlocks, SmallVectorImpl<Instruction *> &InsertPts, PredIteratorCache &PIC, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, Loop *CurLoop, AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo, @@ -1026,17 +1218,7 @@ bool llvm::promoteLoopAccessesToScalars( CurAST != nullptr && SafetyInfo != nullptr && "Unexpected Input to promoteLoopAccessesToScalars"); - // We can promote this alias set if it has a store, if it is a "Must" alias - // set, if the pointer is loop invariant, and if we are not eliminating any - // volatile loads or stores. 
- if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() || - AS.isVolatile() || !CurLoop->isLoopInvariant(AS.begin()->getValue())) - return false; - - assert(!AS.empty() && - "Must alias set should have at least one pointer element in it!"); - - Value *SomePtr = AS.begin()->getValue(); + Value *SomePtr = *PointerMustAliases.begin(); BasicBlock *Preheader = CurLoop->getLoopPreheader(); // It isn't safe to promote a load/store from the loop if the load/store is @@ -1065,8 +1247,8 @@ bool llvm::promoteLoopAccessesToScalars( // is safe (i.e. proving dereferenceability on all paths through the loop). We // can use any access within the alias set to prove dereferenceability, // since they're all must alias. - // - // There are two ways establish (p2): + // + // There are two ways establish (p2): // a) Prove the location is thread-local. In this case the memory model // requirement does not apply, and stores are safe to insert. // b) Prove a store dominates every exit block. In this case, if an exit @@ -1080,55 +1262,36 @@ bool llvm::promoteLoopAccessesToScalars( bool SafeToInsertStore = false; SmallVector<Instruction *, 64> LoopUses; - SmallPtrSet<Value *, 4> PointerMustAliases; // We start with an alignment of one and try to find instructions that allow // us to prove better alignment. unsigned Alignment = 1; // Keep track of which types of access we see - bool SawUnorderedAtomic = false; + bool SawUnorderedAtomic = false; bool SawNotAtomic = false; AAMDNodes AATags; const DataLayout &MDL = Preheader->getModule()->getDataLayout(); - // Do we know this object does not escape ? - bool IsKnownNonEscapingObject = false; + bool IsKnownThreadLocalObject = false; if (SafetyInfo->MayThrow) { // If a loop can throw, we have to insert a store along each unwind edge. // That said, we can't actually make the unwind edge explicit. Therefore, - // we have to prove that the store is dead along the unwind edge. - // - // If the underlying object is not an alloca, nor a pointer that does not - // escape, then we can not effectively prove that the store is dead along - // the unwind edge. i.e. the caller of this function could have ways to - // access the pointed object. + // we have to prove that the store is dead along the unwind edge. We do + // this by proving that the caller can't have a reference to the object + // after return and thus can't possibly load from the object. Value *Object = GetUnderlyingObject(SomePtr, MDL); - // If this is a base pointer we do not understand, simply bail. - // We only handle alloca and return value from alloc-like fn right now. - if (!isa<AllocaInst>(Object)) { - if (!isAllocLikeFn(Object, TLI)) - return false; - // If this is an alloc like fn. There are more constraints we need to verify. - // More specifically, we must make sure that the pointer can not escape. - // - // NOTE: PointerMayBeCaptured is not enough as the pointer may have escaped - // even though its not captured by the enclosing function. Standard allocation - // functions like malloc, calloc, and operator new return values which can - // be assumed not to have previously escaped. - if (PointerMayBeCaptured(Object, true, true)) - return false; - IsKnownNonEscapingObject = true; - } + if (!isKnownNonEscaping(Object, TLI)) + return false; + // Subtlety: Alloca's aren't visible to callers, but *are* potentially + // visible to other threads if captured and used during their lifetimes. 
+ IsKnownThreadLocalObject = !isa<AllocaInst>(Object); } // Check that all of the pointers in the alias set have the same type. We // cannot (yet) promote a memory location that is loaded and stored in // different sizes. While we are at it, collect alignment and AA info. - for (const auto &ASI : AS) { - Value *ASIV = ASI.getValue(); - PointerMustAliases.insert(ASIV); - + for (Value *ASIV : PointerMustAliases) { // Check that all of the pointers in the alias set have the same type. We // cannot (yet) promote a memory location that is loaded and stored in // different sizes. @@ -1147,7 +1310,7 @@ bool llvm::promoteLoopAccessesToScalars( assert(!Load->isVolatile() && "AST broken"); if (!Load->isUnordered()) return false; - + SawUnorderedAtomic |= Load->isAtomic(); SawNotAtomic |= !Load->isAtomic(); @@ -1234,14 +1397,13 @@ bool llvm::promoteLoopAccessesToScalars( // stores along paths which originally didn't have them without violating the // memory model. if (!SafeToInsertStore) { - // If this is a known non-escaping object, it is safe to insert the stores. - if (IsKnownNonEscapingObject) + if (IsKnownThreadLocalObject) SafeToInsertStore = true; else { Value *Object = GetUnderlyingObject(SomePtr, MDL); SafeToInsertStore = - (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) && - !PointerMayBeCaptured(Object, true, true); + (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) && + !PointerMayBeCaptured(Object, true, true); } } @@ -1252,9 +1414,11 @@ bool llvm::promoteLoopAccessesToScalars( // Otherwise, this is safe to promote, lets do it! DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr << '\n'); - ORE->emit( - OptimizationRemark(DEBUG_TYPE, "PromoteLoopAccessesToScalar", LoopUses[0]) - << "Moving accesses to memory location out of the loop"); + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "PromoteLoopAccessesToScalar", + LoopUses[0]) + << "Moving accesses to memory location out of the loop"; + }); ++NumPromoted; // Grab a debug location for the inserted loads/stores; given that the @@ -1333,7 +1497,7 @@ LoopInvariantCodeMotion::collectAliasInfoForLoop(Loop *L, LoopInfo *LI, auto mergeLoop = [&](Loop *L) { // Loop over the body of this loop, looking for calls, invokes, and stores. for (BasicBlock *BB : L->blocks()) - CurAST->add(*BB); // Incorporate the specified basic block + CurAST->add(*BB); // Incorporate the specified basic block }; // Add everything from the sub loops that are no longer directly available. 
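Before the next file, a word on what the promotion machinery above buys: a must-aliased, loop-invariant, non-volatile location is turned into a register for the duration of the loop, with one load in the preheader and one store per exit. A minimal self-contained C++ sketch of that source-level effect (the example and all names in it are invented for illustration, not code from this commit):

#include <cstdio>

// Before promotion: *p is loaded and stored on every iteration.
void accumulate_before(int *p, const int *a, int n) {
  for (int i = 0; i < n; ++i)
    *p += a[i]; // load, add, store each time around the loop
}

// After promotion: one load in the "preheader", one store in the "exit".
void accumulate_after(int *p, const int *a, int n) {
  int t = *p; // load hoisted out of the loop
  for (int i = 0; i < n; ++i)
    t += a[i]; // the loop body now works on a register
  *p = t; // store sunk to the loop exit
}

int main() {
  int a[] = {1, 2, 3, 4};
  int x = 0, y = 0;
  accumulate_before(&x, a, 4);
  accumulate_after(&y, a, 4);
  std::printf("%d %d\n", x, y); // both print 10
  return 0;
}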
diff --git a/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/lib/Transforms/Scalar/LoopDataPrefetch.cpp index d09af32a99fd..7f7c6de76450 100644 --- a/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -18,25 +18,20 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ValueMapper.h" using namespace llvm; @@ -120,9 +115,7 @@ public: AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); AU.addRequired<ScalarEvolutionWrapperPass>(); - // FIXME: For some reason, preserving SE here breaks LSR (even if - // this pass changes nothing). - // AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); } @@ -329,8 +322,10 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { ++NumPrefetches; DEBUG(dbgs() << " Access: " << *PtrValue << ", SCEV: " << *LSCEV << "\n"); - ORE->emit(OptimizationRemark(DEBUG_TYPE, "Prefetched", MemI) - << "prefetched memory access"); + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "Prefetched", MemI) + << "prefetched memory access"; + }); MadeChange = true; } diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp index ac4dd44a0e90..82604a8842bf 100644 --- a/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/lib/Transforms/Scalar/LoopDeletion.cpp @@ -30,20 +30,12 @@ using namespace llvm; STATISTIC(NumDeleted, "Number of loops deleted"); -/// This function deletes dead loops. The caller of this function needs to -/// guarantee that the loop is infact dead. Here we handle two kinds of dead -/// loop. The first kind (\p isLoopDead) is where only invariant values from -/// within the loop are used outside of it. The second kind (\p -/// isLoopNeverExecuted) is where the loop is provably never executed. We can -/// always remove never executed loops since they will not cause any difference -/// to program behaviour. -/// -/// This also updates the relevant analysis information in \p DT, \p SE, and \p -/// LI. It also updates the loop PM if an updater struct is provided. -// TODO: This function will be used by loop-simplifyCFG as well. So, move this -// to LoopUtils.cpp -static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE, - LoopInfo &LI, LPMUpdater *Updater = nullptr); +enum class LoopDeletionResult { + Unmodified, + Modified, + Deleted, +}; + /// Determines if a loop is dead. 
/// /// This assumes that we've already checked for unique exit and exiting blocks, @@ -144,8 +136,8 @@ static bool isLoopNeverExecuted(Loop *L) { /// \returns true if any changes were made. This may mutate the loop even if it /// is unable to delete it due to hoisting trivially loop invariant /// instructions out of the loop. -static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE, - LoopInfo &LI, LPMUpdater *Updater = nullptr) { +static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT, + ScalarEvolution &SE, LoopInfo &LI) { assert(L->isLCSSAForm(DT) && "Expected LCSSA!"); // We can only remove the loop if there is a preheader that we can branch from @@ -155,13 +147,13 @@ static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE, if (!Preheader || !L->hasDedicatedExits()) { DEBUG(dbgs() << "Deletion requires Loop with preheader and dedicated exits.\n"); - return false; + return LoopDeletionResult::Unmodified; } // We can't remove loops that contain subloops. If the subloops were dead, // they would already have been removed in earlier executions of this pass. if (L->begin() != L->end()) { DEBUG(dbgs() << "Loop contains subloops.\n"); - return false; + return LoopDeletionResult::Unmodified; } @@ -176,9 +168,9 @@ static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE, P->setIncomingValue(i, UndefValue::get(P->getType())); BI++; } - deleteDeadLoop(L, DT, SE, LI, Updater); + deleteDeadLoop(L, &DT, &SE, &LI); ++NumDeleted; - return true; + return LoopDeletionResult::Deleted; } // The remaining checks below are for a loop being dead because all statements @@ -192,13 +184,14 @@ static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE, // a loop invariant manner. if (!ExitBlock) { DEBUG(dbgs() << "Deletion requires single exit block\n"); - return false; + return LoopDeletionResult::Unmodified; } // Finally, we have to check that the loop really is dead. bool Changed = false; if (!isLoopDead(L, SE, ExitingBlocks, ExitBlock, Changed, Preheader)) { DEBUG(dbgs() << "Loop is not invariant, cannot delete.\n"); - return Changed; + return Changed ? LoopDeletionResult::Modified + : LoopDeletionResult::Unmodified; } // Don't remove loops for which we can't solve the trip count. @@ -206,114 +199,15 @@ static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE, const SCEV *S = SE.getMaxBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(S)) { DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount.\n"); - return Changed; + return Changed ? LoopDeletionResult::Modified + : LoopDeletionResult::Unmodified; } DEBUG(dbgs() << "Loop is invariant, delete it!"); - deleteDeadLoop(L, DT, SE, LI, Updater); + deleteDeadLoop(L, &DT, &SE, &LI); ++NumDeleted; - return true; -} - -static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE, - LoopInfo &LI, LPMUpdater *Updater) { - assert(L->isLCSSAForm(DT) && "Expected LCSSA!"); - auto *Preheader = L->getLoopPreheader(); - assert(Preheader && "Preheader should exist!"); - - // Now that we know the removal is safe, remove the loop by changing the - // branch from the preheader to go to the single exit block. - // - // Because we're deleting a large chunk of code at once, the sequence in which - // we remove things is very important to avoid invalidation issues. - - // If we have an LPM updater, tell it about the loop being removed. - if (Updater) - Updater->markLoopAsDeleted(*L); - - // Tell ScalarEvolution that the loop is deleted. 
Do this before - // deleting the loop so that ScalarEvolution can look at the loop - // to determine what it needs to clean up. - SE.forgetLoop(L); - - auto *ExitBlock = L->getUniqueExitBlock(); - assert(ExitBlock && "Should have a unique exit block!"); - - assert(L->hasDedicatedExits() && "Loop should have dedicated exits!"); - - // Connect the preheader directly to the exit block. - // Even when the loop is never executed, we cannot remove the edge from the - // source block to the exit block. Consider the case where the unexecuted loop - // branches back to an outer loop. If we deleted the loop and removed the edge - // coming to this inner loop, this will break the outer loop structure (by - // deleting the backedge of the outer loop). If the outer loop is indeed a - // non-loop, it will be deleted in a future iteration of loop deletion pass. - Preheader->getTerminator()->replaceUsesOfWith(L->getHeader(), ExitBlock); - - // Rewrite phis in the exit block to get their inputs from the Preheader - // instead of the exiting block. - BasicBlock::iterator BI = ExitBlock->begin(); - while (PHINode *P = dyn_cast<PHINode>(BI)) { - // Set the zero'th element of Phi to be from the preheader and remove all - // other incoming values. Given the loop has dedicated exits, all other - // incoming values must be from the exiting blocks. - int PredIndex = 0; - P->setIncomingBlock(PredIndex, Preheader); - // Removes all incoming values from all other exiting blocks (including - // duplicate values from an exiting block). - // Nuke all entries except the zero'th entry which is the preheader entry. - // NOTE! We need to remove Incoming Values in the reverse order as done - // below, to keep the indices valid for deletion (removeIncomingValues - // updates getNumIncomingValues and shifts all values down into the operand - // being deleted). - for (unsigned i = 0, e = P->getNumIncomingValues() - 1; i != e; ++i) - P->removeIncomingValue(e-i, false); - - assert((P->getNumIncomingValues() == 1 && - P->getIncomingBlock(PredIndex) == Preheader) && - "Should have exactly one value and that's from the preheader!"); - ++BI; - } - - // Update the dominator tree and remove the instructions and blocks that will - // be deleted from the reference counting scheme. - SmallVector<DomTreeNode*, 8> ChildNodes; - for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); - LI != LE; ++LI) { - // Move all of the block's children to be children of the Preheader, which - // allows us to remove the domtree entry for the block. - ChildNodes.insert(ChildNodes.begin(), DT[*LI]->begin(), DT[*LI]->end()); - for (DomTreeNode *ChildNode : ChildNodes) { - DT.changeImmediateDominator(ChildNode, DT[Preheader]); - } - - ChildNodes.clear(); - DT.eraseNode(*LI); - - // Remove the block from the reference counting scheme, so that we can - // delete it freely later. - (*LI)->dropAllReferences(); - } - - // Erase the instructions and the blocks without having to worry - // about ordering because we already dropped the references. - // NOTE: This iteration is safe because erasing the block does not remove its - // entry from the loop's block list. We do that in the next section. - for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); - LI != LE; ++LI) - (*LI)->eraseFromParent(); - - // Finally, the blocks from loopinfo. This has to happen late because - // otherwise our loop iterators won't work. 
- - SmallPtrSet<BasicBlock *, 8> blocks; - blocks.insert(L->block_begin(), L->block_end()); - for (BasicBlock *BB : blocks) - LI.removeBlock(BB); - - // The last step is to update LoopInfo now that we've eliminated this loop. - LI.markAsRemoved(L); + return LoopDeletionResult::Deleted; } PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM, @@ -322,9 +216,14 @@ PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM, DEBUG(dbgs() << "Analyzing Loop for deletion: "); DEBUG(L.dump()); - if (!deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI, &Updater)) + std::string LoopName = L.getName(); + auto Result = deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI); + if (Result == LoopDeletionResult::Unmodified) return PreservedAnalyses::all(); + if (Result == LoopDeletionResult::Deleted) + Updater.markLoopAsDeleted(L, LoopName); + return getLoopPassPreservedAnalyses(); } @@ -354,7 +253,7 @@ INITIALIZE_PASS_END(LoopDeletionLegacyPass, "loop-deletion", Pass *llvm::createLoopDeletionPass() { return new LoopDeletionLegacyPass(); } -bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &) { +bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) { if (skipLoop(L)) return false; DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); @@ -363,5 +262,11 @@ bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &) { DEBUG(dbgs() << "Analyzing Loop for deletion: "); DEBUG(L->dump()); - return deleteLoopIfDead(L, DT, SE, LI); + + LoopDeletionResult Result = deleteLoopIfDead(L, DT, SE, LI); + + if (Result == LoopDeletionResult::Deleted) + LPM.markLoopAsDeleted(*L); + + return Result != LoopDeletionResult::Unmodified; } diff --git a/lib/Transforms/Scalar/LoopDistribute.cpp b/lib/Transforms/Scalar/LoopDistribute.cpp index 3624bba10345..0d7e3db901cb 100644 --- a/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/lib/Transforms/Scalar/LoopDistribute.cpp @@ -23,32 +23,61 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopDistribute.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include 
"llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include <cassert> +#include <functional> #include <list> +#include <tuple> +#include <utility> + +using namespace llvm; #define LDIST_NAME "loop-distribute" #define DEBUG_TYPE LDIST_NAME -using namespace llvm; - static cl::opt<bool> LDistVerify("loop-distribute-verify", cl::Hidden, cl::desc("Turn on DominatorTree and LoopInfo verification " @@ -81,14 +110,15 @@ static cl::opt<bool> EnableLoopDistribute( STATISTIC(NumLoopsDistributed, "Number of loops distributed"); namespace { + /// \brief Maintains the set of instructions of the loop for a partition before /// cloning. After cloning, it hosts the new loop. class InstPartition { - typedef SmallPtrSet<Instruction *, 8> InstructionSet; + using InstructionSet = SmallPtrSet<Instruction *, 8>; public: InstPartition(Instruction *I, Loop *L, bool DepCycle = false) - : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) { + : DepCycle(DepCycle), OrigLoop(L) { Set.insert(I); } @@ -220,7 +250,7 @@ private: /// \brief The cloned loop. If this partition is mapped to the original loop, /// this is null. - Loop *ClonedLoop; + Loop *ClonedLoop = nullptr; /// \brief The blocks of ClonedLoop including the preheader. If this /// partition is mapped to the original loop, this is empty. @@ -235,7 +265,7 @@ private: /// \brief Holds the set of Partitions. It populates them, merges them and then /// clones the loops. class InstPartitionContainer { - typedef DenseMap<Instruction *, int> InstToPartitionIdT; + using InstToPartitionIdT = DenseMap<Instruction *, int>; public: InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT) @@ -308,8 +338,8 @@ public: /// /// Return if any partitions were merged. bool mergeToAvoidDuplicatedLoads() { - typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT; - typedef EquivalenceClasses<InstPartition *> ToBeMergedT; + using LoadToPartitionT = DenseMap<Instruction *, InstPartition *>; + using ToBeMergedT = EquivalenceClasses<InstPartition *>; LoadToPartitionT LoadToPartition; ToBeMergedT ToBeMerged; @@ -511,7 +541,7 @@ public: } private: - typedef std::list<InstPartition> PartitionContainerT; + using PartitionContainerT = std::list<InstPartition>; /// \brief List of partitions. PartitionContainerT PartitionContainer; @@ -552,17 +582,17 @@ private: /// By traversing the memory instructions in program order and accumulating this /// number, we know whether any unsafe dependence crosses over a program point. 
class MemoryInstructionDependences { - typedef MemoryDepChecker::Dependence Dependence; + using Dependence = MemoryDepChecker::Dependence; public: struct Entry { Instruction *Inst; - unsigned NumUnsafeDependencesStartOrEnd; + unsigned NumUnsafeDependencesStartOrEnd = 0; - Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {} + Entry(Instruction *Inst) : Inst(Inst) {} }; - typedef SmallVector<Entry, 8> AccessesType; + using AccessesType = SmallVector<Entry, 8>; AccessesType::const_iterator begin() const { return Accesses.begin(); } AccessesType::const_iterator end() const { return Accesses.end(); } @@ -594,7 +624,7 @@ class LoopDistributeForLoop { public: LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE) - : L(L), F(F), LI(LI), LAI(nullptr), DT(DT), SE(SE), ORE(ORE) { + : L(L), F(F), LI(LI), DT(DT), SE(SE), ORE(ORE) { setForced(); } @@ -755,9 +785,11 @@ public: ++NumLoopsDistributed; // Report the success. - ORE->emit(OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(), - L->getHeader()) - << "distributed loop"); + ORE->emit([&]() { + return OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(), + L->getHeader()) + << "distributed loop"; + }); return true; } @@ -769,11 +801,13 @@ public: DEBUG(dbgs() << "Skipping; " << Message << "\n"); // With Rpass-missed report that distribution failed. - ORE->emit( - OptimizationRemarkMissed(LDIST_NAME, "NotDistributed", L->getStartLoc(), - L->getHeader()) - << "loop not distributed: use -Rpass-analysis=loop-distribute for more " - "info"); + ORE->emit([&]() { + return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed", + L->getStartLoc(), L->getHeader()) + << "loop not distributed: use -Rpass-analysis=loop-distribute for " + "more " + "info"; + }); // With Rpass-analysis report why. This is on by default if distribution // was requested explicitly. @@ -857,7 +891,7 @@ private: // Analyses used. LoopInfo *LI; - const LoopAccessInfo *LAI; + const LoopAccessInfo *LAI = nullptr; DominatorTree *DT; ScalarEvolution *SE; OptimizationRemarkEmitter *ORE; @@ -871,6 +905,8 @@ private: Optional<bool> IsForced; }; +} // end anonymous namespace + /// Shared implementation between new and old PMs. static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, @@ -901,9 +937,13 @@ static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT, return Changed; } +namespace { + /// \brief The pass class. class LoopDistributeLegacy : public FunctionPass { public: + static char ID; + LoopDistributeLegacy() : FunctionPass(ID) { // The default is set by the caller. 
initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry()); @@ -934,10 +974,9 @@ public: AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); } - - static char ID; }; -} // anonymous namespace + +} // end anonymous namespace PreservedAnalyses LoopDistributePass::run(Function &F, FunctionAnalysisManager &AM) { @@ -956,7 +995,7 @@ PreservedAnalyses LoopDistributePass::run(Function &F, auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); std::function<const LoopAccessInfo &(Loop &)> GetLAA = [&](Loop &L) -> const LoopAccessInfo & { - LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI}; + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, nullptr}; return LAM.getResult<LoopAccessAnalysis>(L, AR); }; @@ -971,6 +1010,7 @@ PreservedAnalyses LoopDistributePass::run(Function &F, } char LoopDistributeLegacy::ID; + static const char ldist_name[] = "Loop Distribution"; INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, @@ -982,6 +1022,4 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false) -namespace llvm { -FunctionPass *createLoopDistributePass() { return new LoopDistributeLegacy(); } -} +FunctionPass *llvm::createLoopDistributePass() { return new LoopDistributeLegacy(); } diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 4a6a35c0ab1b..21551f0a0825 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -1,4 +1,4 @@ -//===-- LoopIdiomRecognize.cpp - Loop idiom recognition -------------------===// +//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===// // // The LLVM Compiler Infrastructure // @@ -38,32 +38,64 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopIdiomRecognize.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/BasicAliasAnalysis.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include 
"llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <utility> +#include <vector> + using namespace llvm; #define DEBUG_TYPE "loop-idiom" @@ -80,7 +112,7 @@ static cl::opt<bool> UseLIRCodeSizeHeurs( namespace { class LoopIdiomRecognize { - Loop *CurLoop; + Loop *CurLoop = nullptr; AliasAnalysis *AA; DominatorTree *DT; LoopInfo *LI; @@ -96,20 +128,21 @@ public: TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, const DataLayout *DL) - : CurLoop(nullptr), AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), - DL(DL) {} + : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL) {} bool runOnLoop(Loop *L); private: - typedef SmallVector<StoreInst *, 8> StoreList; - typedef MapVector<Value *, StoreList> StoreListMap; + using StoreList = SmallVector<StoreInst *, 8>; + using StoreListMap = MapVector<Value *, StoreList>; + StoreListMap StoreRefsForMemset; StoreListMap StoreRefsForMemsetPattern; StoreList StoreRefsForMemcpy; bool HasMemset; bool HasMemsetPattern; bool HasMemcpy; + /// Return code for isLegalStore() enum LegalStoreKind { None = 0, @@ -164,6 +197,7 @@ private: class LoopIdiomRecognizeLegacyPass : public LoopPass { public: static char ID; + explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) { initializeLoopIdiomRecognizeLegacyPassPass( *PassRegistry::getPassRegistry()); @@ -190,14 +224,16 @@ public: /// This transformation requires natural loop information & requires that /// loop preheaders be inserted into the CFG. - /// void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); getLoopAnalysisUsage(AU); } }; -} // End anonymous namespace. 
+ +} // end anonymous namespace + +char LoopIdiomRecognizeLegacyPass::ID = 0; PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, @@ -211,7 +247,6 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, return getLoopPassPreservedAnalyses(); } -char LoopIdiomRecognizeLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom", "Recognize loop idioms", false, false) INITIALIZE_PASS_DEPENDENCY(LoopPass) @@ -299,13 +334,6 @@ bool LoopIdiomRecognize::runOnCountableLoop() { return MadeChange; } -static unsigned getStoreSizeInBytes(StoreInst *SI, const DataLayout *DL) { - uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType()); - assert(((SizeInBits & 7) || (SizeInBits >> 32) == 0) && - "Don't overflow unsigned."); - return (unsigned)SizeInBits >> 3; -} - static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) { const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1)); return ConstStride->getAPInt(); @@ -354,7 +382,6 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) { LoopIdiomRecognize::LegalStoreKind LoopIdiomRecognize::isLegalStore(StoreInst *SI) { - // Don't touch volatile stores. if (SI->isVolatile()) return LegalStoreKind::None; @@ -424,7 +451,7 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) { // Check to see if the stride matches the size of the store. If so, then we // know that every byte is touched in the loop. APInt Stride = getStoreStride(StoreEv); - unsigned StoreSize = getStoreSizeInBytes(SI, DL); + unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType()); if (StoreSize != Stride && StoreSize != -Stride) return LegalStoreKind::None; @@ -563,7 +590,7 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEVAddRecExpr *FirstStoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr)); APInt FirstStride = getStoreStride(FirstStoreEv); - unsigned FirstStoreSize = getStoreSizeInBytes(SL[i], DL); + unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType()); // See if we can optimize just this store in isolation. if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) { @@ -656,7 +683,7 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL, break; AdjacentStores.insert(I); - StoreSize += getStoreSizeInBytes(I, DL); + StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType()); // Move to the next value in the chain. I = ConsecutiveChain[I]; } @@ -761,7 +788,8 @@ mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, ++BI) for (Instruction &I : **BI) if (IgnoredStores.count(&I) == 0 && - (AA.getModRefInfo(&I, StoreLoc) & Access)) + isModOrRefSet( + intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access))) return true; return false; @@ -780,6 +808,41 @@ static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount, return SE->getMinusSCEV(Start, Index); } +/// Compute the number of bytes as a SCEV from the backedge taken count. +/// +/// This also maps the SCEV into the provided type and tries to handle the +/// computation in a way that will fold cleanly. +static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr, + unsigned StoreSize, Loop *CurLoop, + const DataLayout *DL, ScalarEvolution *SE) { + const SCEV *NumBytesS; + // The # stored bytes is (BECount+1)*Size. Expand the trip count out to + // pointer size if it isn't already. 
+  //
+  // If we're going to need to zero extend the BE count, check if we can add
+  // one to it prior to zero extending without overflow. Provided this is safe,
+  // it allows better simplification of the +1.
+  if (DL->getTypeSizeInBits(BECount->getType()) <
+          DL->getTypeSizeInBits(IntPtr) &&
+      SE->isLoopEntryGuardedByCond(
+          CurLoop, ICmpInst::ICMP_NE, BECount,
+          SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
+    NumBytesS = SE->getZeroExtendExpr(
+        SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
+        IntPtr);
+  } else {
+    NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
+                               SE->getOne(IntPtr), SCEV::FlagNUW);
+  }
+
+  // And scale it based on the store size.
+  if (StoreSize != 1) {
+    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
+                               SCEV::FlagNUW);
+  }
+  return NumBytesS;
+}
+
 /// processLoopStridedStore - We see a strided store of some value. If we can
 /// transform this into a memset or memset_pattern in the loop preheader, do so.
 bool LoopIdiomRecognize::processLoopStridedStore(
@@ -824,8 +887,8 @@ bool LoopIdiomRecognize::processLoopStridedStore(
   // base pointer and checking the region.
   Value *BasePtr =
       Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
-  if (mayLoopAccessLocation(BasePtr, MRI_ModRef, CurLoop, BECount, StoreSize,
-                            *AA, Stores)) {
+  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
+                            StoreSize, *AA, Stores)) {
     Expander.clear();
     // If we generated new code for the base pointer, clean up.
     RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI);
@@ -837,16 +900,8 @@ bool LoopIdiomRecognize::processLoopStridedStore(
 
   // Okay, everything looks good, insert the memset.
 
-  // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
-  // pointer size if it isn't already.
-  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
-  const SCEV *NumBytesS =
-      SE->getAddExpr(BECount, SE->getOne(IntPtr), SCEV::FlagNUW);
-  if (StoreSize != 1) {
-    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
-                               SCEV::FlagNUW);
-  }
+  const SCEV *NumBytesS =
+      getNumBytes(BECount, IntPtr, StoreSize, CurLoop, DL, SE);
 
   // TODO: ideally we should still be able to generate memset if SCEV expander
   // is taught to generate the dependencies at the latest point.
@@ -903,7 +958,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
   Value *StorePtr = SI->getPointerOperand();
   const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
   APInt Stride = getStoreStride(StoreEv);
-  unsigned StoreSize = getStoreSizeInBytes(SI, DL);
+  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
   bool NegStride = StoreSize == -Stride;
 
   // The store must be feeding a non-volatile load.
@@ -942,7 +997,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
 
   SmallPtrSet<Instruction *, 1> Stores;
   Stores.insert(SI);
-  if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
+  if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                             StoreSize, *AA, Stores)) {
     Expander.clear();
     // If we generated new code for the base pointer, clean up.
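One remark on the getNumBytes helper introduced above: the isLoopEntryGuardedByCond check is what makes folding the +1 into the zero-extend sound, because +1 and zext do not commute when the narrow backedge-taken count can be all-ones. A small self-contained illustration of the failure case (hypothetical values, not from this patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Model an i32 backedge-taken count widened to an i64 byte count.
      uint32_t BECount = 0xFFFFFFFFu; // i.e. -1 in i32: the problematic value
      uint64_t AddThenZext = (uint64_t)(uint32_t)(BECount + 1); // zext(BECount + 1) == 0
      uint64_t ZextThenAdd = (uint64_t)BECount + 1;             // zext(BECount) + 1 == 2^32
      assert(AddThenZext != ZextThenAdd);
      // So the +1 may only be folded inside the zero-extend once SCEV proves
      // BECount != -1 on loop entry; otherwise extend first, then add one.
      return 0;
    }
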
@@ -962,8 +1017,8 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
   Value *LoadBasePtr = Expander.expandCodeFor(
       LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
 
-  if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize,
-                            *AA, Stores)) {
+  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
+                            StoreSize, *AA, Stores)) {
     Expander.clear();
     // If we generated new code for the base pointer, clean up.
     RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
@@ -976,16 +1031,8 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
 
   // Okay, everything is safe, we can transform this!
 
-  // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
-  // pointer size if it isn't already.
-  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
-  const SCEV *NumBytesS =
-      SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);
-
-  if (StoreSize != 1)
-    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
-                               SCEV::FlagNUW);
+  const SCEV *NumBytesS =
+      getNumBytes(BECount, IntPtrTy, StoreSize, CurLoop, DL, SE);
 
   Value *NumBytes =
       Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
@@ -1010,16 +1057,12 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
     if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
       return false;
 
+    // Create the call.
+    // Note that unordered atomic loads/stores are *required* by the spec to
+    // have an alignment but non-atomic loads/stores may not.
     NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
-        StoreBasePtr, LoadBasePtr, NumBytes, StoreSize);
-
-    // Propagate alignment info onto the pointer args. Note that unordered
-    // atomic loads/stores are *required* by the spec to have an alignment
-    // but non-atomic loads/stores may not.
- NewCall->addParamAttr(0, Attribute::getWithAlignment(NewCall->getContext(), - SI->getAlignment())); - NewCall->addParamAttr(1, Attribute::getWithAlignment(NewCall->getContext(), - LI->getAlignment())); + StoreBasePtr, SI->getAlignment(), LoadBasePtr, LI->getAlignment(), + NumBytes, StoreSize); } NewCall->setDebugLoc(SI->getDebugLoc()); @@ -1273,9 +1316,9 @@ static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX, // step 2: detect instructions corresponding to "x.next = x >> 1" if (!DefX || DefX->getOpcode() != Instruction::AShr) return false; - if (ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1))) - if (!Shft || !Shft->isOne()) - return false; + ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1)); + if (!Shft || !Shft->isOne()) + return false; VarX = DefX->getOperand(0); // step 3: Check the recurrence of variable X @@ -1469,7 +1512,7 @@ static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val, /// PhiX = PHI [InitX, DefX] /// CntInst = CntPhi + 1 /// DefX = PhiX >> 1 -// LOOP_BODY +/// LOOP_BODY /// Br: loop if (DefX != 0) /// Use(CntPhi) or Use(CntInst) /// diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp index af095560cc02..40d468a084d4 100644 --- a/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -12,22 +12,33 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopInstSimplify.h" +#include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/Support/Debug.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/User.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include <algorithm> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "loop-instsimplify" @@ -45,7 +56,7 @@ static bool SimplifyLoopInst(Loop *L, DominatorTree *DT, LoopInfo *LI, // The bit we are stealing from the pointer represents whether this basic // block is the header of a subloop, in which case we only process its phis. 
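For the typedef change just below: PointerIntPair packs the subloop-header flag into a spare low bit of the aligned BasicBlock pointer, so a worklist item stays pointer-sized. A brief usage sketch under that reading (helper name hypothetical):

    #include "llvm/ADT/PointerIntPair.h"
    #include "llvm/IR/BasicBlock.h"

    // One worklist slot: a block plus one bit saying whether the block is a
    // subloop header (in which case only its PHIs are visited).
    using WorklistItem = llvm::PointerIntPair<llvm::BasicBlock *, 1>;

    static WorklistItem makeItem(llvm::BasicBlock *BB, bool IsSubloopHeader) {
      WorklistItem Item(BB, IsSubloopHeader);
      // Item.getPointer() recovers BB; Item.getInt() recovers the flag.
      return Item;
    }
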
- typedef PointerIntPair<BasicBlock *, 1> WorklistItem; + using WorklistItem = PointerIntPair<BasicBlock *, 1>; SmallVector<WorklistItem, 16> VisitStack; SmallPtrSet<BasicBlock *, 32> Visited; @@ -151,9 +162,11 @@ static bool SimplifyLoopInst(Loop *L, DominatorTree *DT, LoopInfo *LI, } namespace { + class LoopInstSimplifyLegacyPass : public LoopPass { public: static char ID; // Pass ID, replacement for typeid + LoopInstSimplifyLegacyPass() : LoopPass(ID) { initializeLoopInstSimplifyLegacyPassPass(*PassRegistry::getPassRegistry()); } @@ -181,7 +194,8 @@ public: getLoopAnalysisUsage(AU); } }; -} + +} // end anonymous namespace PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, @@ -195,6 +209,7 @@ PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, } char LoopInstSimplifyLegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(LoopInstSimplifyLegacyPass, "loop-instsimplify", "Simplify instructions in loops", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp index 2e0d8e0374c0..4f8dafef230a 100644 --- a/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/lib/Transforms/Scalar/LoopInterchange.cpp @@ -1,4 +1,4 @@ -//===- LoopInterchange.cpp - Loop interchange pass------------------------===// +//===- LoopInterchange.cpp - Loop interchange pass-------------------------===// // // The LLVM Compiler Infrastructure // @@ -13,33 +13,38 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/BlockFrequencyInfo.h" -#include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopIterator.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstIterator.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include <cassert> +#include <utility> +#include <vector> using namespace llvm; @@ -51,10 +56,12 @@ static cl::opt<int> LoopInterchangeCostThreshold( namespace { -typedef SmallVector<Loop *, 8> LoopVector; +using LoopVector = SmallVector<Loop *, 8>; // TODO: Check if we can use a sparse matrix here. 
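Before the CharMatrix change below, a concrete picture of the dependency matrix may help: each row models one dependence and each column one loop level (outermost first), using the direction characters that isProfitableForVectorization inspects later in this file. The instance here is illustrative only:

    #include <vector>

    using CharMatrix = std::vector<std::vector<char>>;

    // Two dependences over a 2-deep nest. Per the profitability check later
    // in this file, '=' in the outer column plus 'S' (scalar) or 'I'
    // (independent at that level) in the inner column reads as
    // interchange-friendly for vectorization.
    static const CharMatrix ExampleDepMatrix = {
        {'=', 'S'},
        {'=', 'I'},
    };
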
-typedef std::vector<std::vector<char>> CharMatrix; +using CharMatrix = std::vector<std::vector<char>>; + +} // end anonymous namespace // Maximum number of dependencies that can be handled in the dependency matrix. static const unsigned MaxMemInstrCount = 100; @@ -62,14 +69,11 @@ static const unsigned MaxMemInstrCount = 100; // Maximum loop depth supported. static const unsigned MaxLoopNestDepth = 10; -struct LoopInterchange; - #ifdef DUMP_DEP_MATRICIES -void printDepMatrix(CharMatrix &DepMatrix) { - for (auto I = DepMatrix.begin(), E = DepMatrix.end(); I != E; ++I) { - std::vector<char> Vec = *I; - for (auto II = Vec.begin(), EE = Vec.end(); II != EE; ++II) - DEBUG(dbgs() << *II << " "); +static void printDepMatrix(CharMatrix &DepMatrix) { + for (auto &Row : DepMatrix) { + for (auto D : Row) + DEBUG(dbgs() << D << " "); DEBUG(dbgs() << "\n"); } } @@ -77,25 +81,24 @@ void printDepMatrix(CharMatrix &DepMatrix) { static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Loop *L, DependenceInfo *DI) { - typedef SmallVector<Value *, 16> ValueVector; + using ValueVector = SmallVector<Value *, 16>; + ValueVector MemInstr; // For each block. - for (Loop::block_iterator BB = L->block_begin(), BE = L->block_end(); - BB != BE; ++BB) { + for (BasicBlock *BB : L->blocks()) { // Scan the BB and collect legal loads and stores. - for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; - ++I) { + for (Instruction &I : *BB) { if (!isa<Instruction>(I)) return false; - if (LoadInst *Ld = dyn_cast<LoadInst>(I)) { + if (auto *Ld = dyn_cast<LoadInst>(&I)) { if (!Ld->isSimple()) return false; - MemInstr.push_back(&*I); - } else if (StoreInst *St = dyn_cast<StoreInst>(I)) { + MemInstr.push_back(&I); + } else if (auto *St = dyn_cast<StoreInst>(&I)) { if (!St->isSimple()) return false; - MemInstr.push_back(&*I); + MemInstr.push_back(&I); } } } @@ -171,7 +174,7 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, } // We don't have a DepMatrix to check legality return false. - if (DepMatrix.size() == 0) + if (DepMatrix.empty()) return false; return true; } @@ -216,7 +219,6 @@ static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row, static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row, unsigned OuterLoopId, char InnerDep, char OuterDep) { - if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId)) return false; @@ -255,7 +257,6 @@ static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row, static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, unsigned InnerLoopId, unsigned OuterLoopId) { - unsigned NumRows = DepMatrix.size(); // For each row check if it is valid to interchange. for (unsigned Row = 0; Row < NumRows; ++Row) { @@ -270,7 +271,6 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, } static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) { - DEBUG(dbgs() << "Calling populateWorklist on Func: " << L.getHeader()->getParent()->getName() << " Loop: %" << L.getHeader()->getName() << '\n'); @@ -320,6 +320,8 @@ static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) { return nullptr; } +namespace { + /// LoopInterchangeLegality checks if it is legal to interchange the loop. 
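As a reminder of what the legality class below protects, here is the transformation itself in plain C++ (a textbook illustration, not taken from this patch): interchanging a tightly nested pair turns a strided innermost access into a unit-stride one.

    // Illustrative only. No dependence forbids swapping these two loops.
    constexpr int N = 64, M = 64;
    int A[N][M];

    void before() {
      for (int j = 0; j < M; ++j)   // outer
        for (int i = 0; i < N; ++i) // inner: A[i][j] strides by M ints
          A[i][j] += 1;
    }

    void after() {                  // the interchanged nest
      for (int i = 0; i < N; ++i)
        for (int j = 0; j < M; ++j) // inner: A[i][j] is now consecutive
          A[i][j] += 1;
    }
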
class LoopInterchangeLegality { public: @@ -327,11 +329,12 @@ public: LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA, OptimizationRemarkEmitter *ORE) : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), - PreserveLCSSA(PreserveLCSSA), ORE(ORE), InnerLoopHasReduction(false) {} + PreserveLCSSA(PreserveLCSSA), ORE(ORE) {} /// Check if the loops can be interchanged. bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix); + /// Check if the loop structure is understood. We do not handle triangular /// loops for now. bool isLoopStructureUnderstood(PHINode *InnerInductionVar); @@ -348,6 +351,7 @@ private: bool findInductionAndReductions(Loop *L, SmallVector<PHINode *, 8> &Inductions, SmallVector<PHINode *, 8> &Reductions); + Loop *OuterLoop; Loop *InnerLoop; @@ -355,10 +359,11 @@ private: LoopInfo *LI; DominatorTree *DT; bool PreserveLCSSA; + /// Interface to emit optimization remarks. OptimizationRemarkEmitter *ORE; - bool InnerLoopHasReduction; + bool InnerLoopHasReduction = false; }; /// LoopInterchangeProfitability checks if it is profitable to interchange the @@ -381,6 +386,7 @@ private: /// Scev analysis. ScalarEvolution *SE; + /// Interface to emit optimization remarks. OptimizationRemarkEmitter *ORE; }; @@ -415,6 +421,7 @@ private: /// Scev analysis. ScalarEvolution *SE; + LoopInfo *LI; DominatorTree *DT; BasicBlock *LoopExit; @@ -424,16 +431,16 @@ private: // Main LoopInterchange Pass. struct LoopInterchange : public FunctionPass { static char ID; - ScalarEvolution *SE; - LoopInfo *LI; - DependenceInfo *DI; - DominatorTree *DT; + ScalarEvolution *SE = nullptr; + LoopInfo *LI = nullptr; + DependenceInfo *DI = nullptr; + DominatorTree *DT = nullptr; bool PreserveLCSSA; + /// Interface to emit optimization remarks. 
OptimizationRemarkEmitter *ORE; - LoopInterchange() - : FunctionPass(ID), SE(nullptr), LI(nullptr), DI(nullptr), DT(nullptr) { + LoopInterchange() : FunctionPass(ID) { initializeLoopInterchangePass(*PassRegistry::getPassRegistry()); } @@ -501,7 +508,6 @@ struct LoopInterchange : public FunctionPass { } bool processLoopList(LoopVector LoopList, Function &F) { - bool Changed = false; unsigned LoopNestDepth = LoopList.size(); if (LoopNestDepth < 2) { @@ -580,7 +586,6 @@ struct LoopInterchange : public FunctionPass { bool processLoop(LoopVector LoopList, unsigned InnerLoopId, unsigned OuterLoopId, BasicBlock *LoopNestExit, std::vector<std::vector<char>> &DependencyMatrix) { - DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId << " and OuterLoopId = " << OuterLoopId << "\n"); Loop *InnerLoop = LoopList[InnerLoopId]; @@ -599,10 +604,12 @@ struct LoopInterchange : public FunctionPass { return false; } - ORE->emit(OptimizationRemark(DEBUG_TYPE, "Interchanged", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Loop interchanged with enclosing loop."); + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "Interchanged", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Loop interchanged with enclosing loop."; + }); LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LoopNestExit, LIL.hasInnerLoopReduction()); @@ -612,9 +619,10 @@ struct LoopInterchange : public FunctionPass { } }; -} // end of namespace +} // end anonymous namespace + bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) { - return none_of(Ins->users(), [=](User *U) -> bool { + return llvm::none_of(Ins->users(), [=](User *U) -> bool { auto *UserIns = dyn_cast<PHINode>(U); RecurrenceDescriptor RD; return !UserIns || !RecurrenceDescriptor::isReductionPHI(UserIns, L, RD); @@ -664,11 +672,9 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { if (!OuterLoopHeaderBI) return false; - for (unsigned i = 0, e = OuterLoopHeaderBI->getNumSuccessors(); i < e; ++i) { - if (OuterLoopHeaderBI->getSuccessor(i) != InnerLoopPreHeader && - OuterLoopHeaderBI->getSuccessor(i) != OuterLoopLatch) + for (BasicBlock *Succ : OuterLoopHeaderBI->successors()) + if (Succ != InnerLoopPreHeader && Succ != OuterLoopLatch) return false; - } DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n"); // We do not have any basic block in between now make sure the outer header @@ -682,10 +688,8 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { return true; } - bool LoopInterchangeLegality::isLoopStructureUnderstood( PHINode *InnerInduction) { - unsigned Num = InnerInduction->getNumOperands(); BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader(); for (unsigned i = 0; i < Num; ++i) { @@ -750,12 +754,12 @@ static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) { static BasicBlock *getLoopLatchExitBlock(BasicBlock *LatchBlock, BasicBlock *LoopHeader) { if (BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator())) { - unsigned Num = BI->getNumSuccessors(); - assert(Num == 2); - for (unsigned i = 0; i < Num; ++i) { - if (BI->getSuccessor(i) == LoopHeader) + assert(BI->getNumSuccessors() == 2 && + "Branch leaving loop latch must have 2 successors"); + for (BasicBlock *Succ : BI->successors()) { + if (Succ == LoopHeader) continue; - return BI->getSuccessor(i); + return Succ; } } return nullptr; @@ -764,7 +768,6 @@ static BasicBlock *getLoopLatchExitBlock(BasicBlock *LatchBlock, // This 
function indicates the current limitations in the transform as a result
// of which we do not proceed.
bool LoopInterchangeLegality::currentLimitations() {
-
  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
  BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
  BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
@@ -777,12 +780,13 @@ bool LoopInterchangeLegality::currentLimitations() {
   if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) {
     DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes "
                  << "are supported currently.\n");
-    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
-                                       "UnsupportedPHIInner",
-                                       InnerLoop->getStartLoc(),
-                                       InnerLoop->getHeader())
-              << "Only inner loops with induction or reduction PHI nodes can be"
-                 " interchanged currently.");
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
+                                      InnerLoop->getStartLoc(),
+                                      InnerLoop->getHeader())
+             << "Only inner loops with induction or reduction PHI nodes can be"
+                " interchanged currently.";
+    });
     return true;
   }
@@ -790,12 +794,13 @@ bool LoopInterchangeLegality::currentLimitations() {
   if (Inductions.size() != 1) {
     DEBUG(dbgs() << "We currently only support loops with 1 induction variable."
                  << "Failed to interchange due to current limitation\n");
-    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
-                                       "MultiInductionInner",
-                                       InnerLoop->getStartLoc(),
-                                       InnerLoop->getHeader())
-              << "Only inner loops with 1 induction variable can be "
-                 "interchanged currently.");
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner",
+                                      InnerLoop->getStartLoc(),
+                                      InnerLoop->getHeader())
+             << "Only inner loops with 1 induction variable can be "
+                "interchanged currently.";
+    });
     return true;
   }
   if (Reductions.size() > 0)
@@ -806,12 +811,13 @@ bool LoopInterchangeLegality::currentLimitations() {
   if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) {
     DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes "
                  << "are supported currently.\n");
-    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
-                                       "UnsupportedPHIOuter",
-                                       OuterLoop->getStartLoc(),
-                                       OuterLoop->getHeader())
-              << "Only outer loops with induction or reduction PHI nodes can be"
-                 " interchanged currently.");
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
+                                      OuterLoop->getStartLoc(),
+                                      OuterLoop->getHeader())
+             << "Only outer loops with induction or reduction PHI nodes can be"
+                " interchanged currently.";
+    });
     return true;
   }
@@ -820,35 +826,38 @@ bool LoopInterchangeLegality::currentLimitations() {
   if (!Reductions.empty()) {
     DEBUG(dbgs() << "Outer loops with reductions are not supported "
                  << "currently.\n");
-    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
-                                       "ReductionsOuter",
-                                       OuterLoop->getStartLoc(),
-                                       OuterLoop->getHeader())
-              << "Outer loops with reductions cannot be interchanged "
-                 "currently.");
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "ReductionsOuter",
+                                      OuterLoop->getStartLoc(),
+                                      OuterLoop->getHeader())
+             << "Outer loops with reductions cannot be interchanged "
+                "currently.";
+    });
     return true;
   }
 
   // TODO: Currently we handle only loops with 1 induction variable.
   if (Inductions.size() != 1) {
     DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
                  << "supported currently.\n");
-    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
-                                       "MultiIndutionOuter",
-                                       OuterLoop->getStartLoc(),
-                                       OuterLoop->getHeader())
-              << "Only outer loops with 1 induction variable can be "
-                 "interchanged currently.");
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter",
+                                      OuterLoop->getStartLoc(),
+                                      OuterLoop->getHeader())
+             << "Only outer loops with 1 induction variable can be "
+                "interchanged currently.";
+    });
     return true;
   }
 
   // TODO: Triangular loops are not handled for now.
   if (!isLoopStructureUnderstood(InnerInductionVar)) {
     DEBUG(dbgs() << "Loop structure not understood by pass\n");
-    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
-                                       "UnsupportedStructureInner",
-                                       InnerLoop->getStartLoc(),
-                                       InnerLoop->getHeader())
-              << "Inner loop structure not understood currently.");
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
+                                      InnerLoop->getStartLoc(),
+                                      InnerLoop->getHeader())
+             << "Inner loop structure not understood currently.";
+    });
     return true;
   }
@@ -857,24 +866,26 @@ bool LoopInterchangeLegality::currentLimitations() {
       getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader);
   if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true)) {
     DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n");
-    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
-                                       "NoLCSSAPHIOuter",
-                                       OuterLoop->getStartLoc(),
-                                       OuterLoop->getHeader())
-              << "Only outer loops with LCSSA PHIs can be interchanged "
-                 "currently.");
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuter",
+                                      OuterLoop->getStartLoc(),
+                                      OuterLoop->getHeader())
+             << "Only outer loops with LCSSA PHIs can be interchanged "
+                "currently.";
+    });
     return true;
   }
 
   LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader);
   if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false)) {
     DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n");
-    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
-                                       "NoLCSSAPHIOuterInner",
-                                       InnerLoop->getStartLoc(),
-                                       InnerLoop->getHeader())
-              << "Only inner loops with LCSSA PHIs can be interchanged "
-                 "currently.");
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuterInner",
+                                      InnerLoop->getStartLoc(),
+                                      InnerLoop->getHeader())
+             << "Only inner loops with LCSSA PHIs can be interchanged "
+                "currently.";
+    });
     return true;
   }
@@ -899,11 +910,12 @@ bool LoopInterchangeLegality::currentLimitations() {
   if (!InnerIndexVarInc) {
     DEBUG(dbgs() << "Did not find an instruction to increment the induction "
                  << "variable.\n");
-    ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
-                                       "NoIncrementInInner",
-                                       InnerLoop->getStartLoc(),
-                                       InnerLoop->getHeader())
-              << "The inner loop does not increment the induction variable.");
+    ORE->emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "NoIncrementInInner",
+                                      InnerLoop->getStartLoc(),
+                                      InnerLoop->getHeader())
+             << "The inner loop does not increment the induction variable.";
+    });
     return true;
   }
@@ -912,8 +924,9 @@ bool LoopInterchangeLegality::currentLimitations() {
   // instruction.
bool FoundInduction = false; - for (const Instruction &I : reverse(*InnerLoopLatch)) { - if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I)) + for (const Instruction &I : llvm::reverse(*InnerLoopLatch)) { + if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) || + isa<ZExtInst>(I)) continue; // We found an instruction. If this is not induction variable then it is not @@ -921,12 +934,13 @@ bool LoopInterchangeLegality::currentLimitations() { if (!I.isIdenticalTo(InnerIndexVarInc)) { DEBUG(dbgs() << "Found unsupported instructions between induction " << "variable increment and branch.\n"); - ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, - "UnsupportedInsBetweenInduction", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Found unsupported instruction between induction variable " - "increment and branch."); + ORE->emit([&]() { + return OptimizationRemarkMissed( + DEBUG_TYPE, "UnsupportedInsBetweenInduction", + InnerLoop->getStartLoc(), InnerLoop->getHeader()) + << "Found unsupported instruction between induction variable " + "increment and branch."; + }); return true; } @@ -937,11 +951,12 @@ bool LoopInterchangeLegality::currentLimitations() { // current limitation. if (!FoundInduction) { DEBUG(dbgs() << "Did not find the induction variable.\n"); - ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, - "NoIndutionVariable", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Did not find the induction variable."); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "NoIndutionVariable", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Did not find the induction variable."; + }); return true; } return false; @@ -950,19 +965,31 @@ bool LoopInterchangeLegality::currentLimitations() { bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) { - if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) { DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId << " and OuterLoopId = " << OuterLoopId << " due to dependence\n"); - ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, - "Dependence", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Cannot interchange loops due to dependences."); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Cannot interchange loops due to dependences."; + }); return false; } + // Check if outer and inner loop contain legal instructions only. + for (auto *BB : OuterLoop->blocks()) + for (Instruction &I : *BB) + if (CallInst *CI = dyn_cast<CallInst>(&I)) { + // readnone functions do not prevent interchanging. + if (CI->doesNotReadMemory()) + continue; + DEBUG(dbgs() << "Loops with call instructions cannot be interchanged " + << "safely."); + return false; + } + // Create unique Preheaders if we already do not have one. BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); @@ -995,12 +1022,13 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, // Check if the loops are tightly nested. 
if (!tightlyNested(OuterLoop, InnerLoop)) { DEBUG(dbgs() << "Loops not tightly nested\n"); - ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, - "NotTightlyNested", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Cannot interchange loops because they are not tightly " - "nested."); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Cannot interchange loops because they are not tightly " + "nested."; + }); return false; } @@ -1010,9 +1038,8 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, int LoopInterchangeProfitability::getInstrOrderCost() { unsigned GoodOrder, BadOrder; BadOrder = GoodOrder = 0; - for (auto BI = InnerLoop->block_begin(), BE = InnerLoop->block_end(); - BI != BE; ++BI) { - for (Instruction &Ins : **BI) { + for (BasicBlock *BB : InnerLoop->blocks()) { + for (Instruction &Ins : *BB) { if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) { unsigned NumOp = GEP->getNumOperands(); bool FoundInnerInduction = false; @@ -1064,12 +1091,11 @@ static bool isProfitableForVectorization(unsigned InnerLoopId, // TODO: Improve this heuristic to catch more cases. // If the inner loop is loop independent or doesn't carry any dependency it is // profitable to move this to outer position. - unsigned Row = DepMatrix.size(); - for (unsigned i = 0; i < Row; ++i) { - if (DepMatrix[i][InnerLoopId] != 'S' && DepMatrix[i][InnerLoopId] != 'I') + for (auto &Row : DepMatrix) { + if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I') return false; // TODO: We need to improve this heuristic. - if (DepMatrix[i][OuterLoopId] != '=') + if (Row[OuterLoopId] != '=') return false; } // If outer loop has dependence and inner loop is loop independent then it is @@ -1080,7 +1106,6 @@ static bool isProfitableForVectorization(unsigned InnerLoopId, bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) { - // TODO: Add better profitability checks. 
// e.g.
// 1) Construct dependency matrix and move the one with no loop carried dep
@@ -1099,14 +1124,15 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
   if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix))
     return true;
 
-  ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
-                                     "InterchangeNotProfitable",
-                                     InnerLoop->getStartLoc(),
-                                     InnerLoop->getHeader())
-            << "Interchanging loops is too costly (cost="
-            << ore::NV("Cost", Cost) << ", threshold="
-            << ore::NV("Threshold", LoopInterchangeCostThreshold) <<
-            ") and it does not improve parallelism.");
+  ORE->emit([&]() {
+    return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
+                                    InnerLoop->getStartLoc(),
+                                    InnerLoop->getHeader())
+           << "Interchanging loops is too costly (cost="
+           << ore::NV("Cost", Cost) << ", threshold="
+           << ore::NV("Threshold", LoopInterchangeCostThreshold)
+           << ") and it does not improve parallelism.";
+  });
   return false;
 }
@@ -1145,7 +1171,7 @@ bool LoopInterchangeTransform::transform() {
   bool Transformed = false;
   Instruction *InnerIndexVar;
 
-  if (InnerLoop->getSubLoops().size() == 0) {
+  if (InnerLoop->getSubLoops().empty()) {
     BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
     DEBUG(dbgs() << "Calling Split Inner Loop\n");
     PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
@@ -1159,7 +1185,11 @@ bool LoopInterchangeTransform::transform() {
     else
       InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
 
-    //
+    // Ensure that InductionPHI is the first Phi node as required by
+    // splitInnerLoopHeader
+    if (&InductionPHI->getParent()->front() != InductionPHI)
+      InductionPHI->moveBefore(&InductionPHI->getParent()->front());
+
     // Split at the place where the induction variable is
     // incremented/decremented.
     // TODO: This splitting logic may not work always. Fix this.
@@ -1188,13 +1218,12 @@ void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) {
 }
 
 void LoopInterchangeTransform::splitInnerLoopHeader() {
-
   // Split the inner loop header out. Here make sure that the reduction PHI's
   // stay in the innerloop body.
   BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
   BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
   if (InnerLoopHasReduction) {
-    // FIXME: Check if the induction PHI will always be the first PHI.
+    // Note: The induction PHI must be the first PHI for this to work
     BasicBlock *New = InnerLoopHeader->splitBasicBlock(
         ++(InnerLoopHeader->begin()), InnerLoopHeader->getName() + ".split");
     if (LI)
@@ -1244,7 +1273,6 @@ void LoopInterchangeTransform::updateIncomingBlock(BasicBlock *CurrBlock,
 }
 
 bool LoopInterchangeTransform::adjustLoopBranches() {
-
   DEBUG(dbgs() << "adjustLoopBranches called\n");
   // Adjust the loop preheader
   BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
@@ -1352,8 +1380,8 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
   return true;
 }
 
-void LoopInterchangeTransform::adjustLoopPreheaders() {
 
+void LoopInterchangeTransform::adjustLoopPreheaders() {
   // We have interchanged the preheaders so we need to interchange the data in
   // the preheader as well.
   // This is because the content of the inner preheader was previously executed
@@ -1373,7 +1401,6 @@ void LoopInterchangeTransform::adjustLoopPreheaders() {
 }
 
 bool LoopInterchangeTransform::adjustLoopLinks() {
-
   // Adjust all branches in the inner and outer loop.
bool Changed = adjustLoopBranches(); if (Changed) @@ -1382,6 +1409,7 @@ bool LoopInterchangeTransform::adjustLoopLinks() { } char LoopInterchange::ID = 0; + INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange", "Interchanges loops for cache reuse", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) diff --git a/lib/Transforms/Scalar/LoopLoadElimination.cpp b/lib/Transforms/Scalar/LoopLoadElimination.cpp index 20b37c4b70e6..dfa5ec1f354d 100644 --- a/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -28,22 +28,29 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/LoopVersioning.h" #include <algorithm> @@ -53,11 +60,11 @@ #include <tuple> #include <utility> +using namespace llvm; + #define LLE_OPTION "loop-load-elim" #define DEBUG_TYPE LLE_OPTION -using namespace llvm; - static cl::opt<unsigned> CheckPerElim( "runtime-check-per-loop-load-elim", cl::Hidden, cl::desc("Max number of memchecks allowed per eliminated load on average"), @@ -127,10 +134,12 @@ struct StoreToLoadForwardingCandidate { #endif }; +} // end anonymous namespace + /// \brief Check if the store dominates all latches, so as long as there is no /// intervening store this value will be loaded in the next iteration. -bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L, - DominatorTree *DT) { +static bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L, + DominatorTree *DT) { SmallVector<BasicBlock *, 8> Latches; L->getLoopLatches(Latches); return llvm::all_of(Latches, [&](const BasicBlock *Latch) { @@ -143,6 +152,8 @@ static bool isLoadConditional(LoadInst *Load, Loop *L) { return Load->getParent() != L->getHeader(); } +namespace { + /// \brief The per-loop class that does most of the work. class LoadEliminationForLoop { public: @@ -241,8 +252,8 @@ public: std::forward_list<StoreToLoadForwardingCandidate> &Candidates) { // If Store is nullptr it means that we have multiple stores forwarding to // this store. - typedef DenseMap<LoadInst *, const StoreToLoadForwardingCandidate *> - LoadToSingleCandT; + using LoadToSingleCandT = + DenseMap<LoadInst *, const StoreToLoadForwardingCandidate *>; LoadToSingleCandT LoadToSingleCand; for (const auto &Cand : Candidates) { @@ -393,7 +404,6 @@ public: void propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand, SCEVExpander &SEE) { - // // loop: // %x = load %gep_i // = ... 
%x @@ -431,6 +441,7 @@ public: bool processLoop() { DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName() << "\" checking " << *L << "\n"); + // Look for store-to-load forwarding cases across the // backedge. E.g.: // @@ -558,6 +569,8 @@ private: PredicatedScalarEvolution PSE; }; +} // end anonymous namespace + static bool eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT, function_ref<const LoopAccessInfo &(Loop &)> GetLAI) { @@ -584,10 +597,14 @@ eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT, return Changed; } +namespace { + /// \brief The pass. Most of the work is delegated to the per-loop /// LoadEliminationForLoop class. class LoopLoadElimination : public FunctionPass { public: + static char ID; + LoopLoadElimination() : FunctionPass(ID) { initializeLoopLoadEliminationPass(*PassRegistry::getPassRegistry()); } @@ -616,13 +633,12 @@ public: AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); } - - static char ID; }; } // end anonymous namespace char LoopLoadElimination::ID; + static const char LLE_name[] = "Loop Load Elimination"; INITIALIZE_PASS_BEGIN(LoopLoadElimination, LLE_OPTION, LLE_name, false, false) @@ -633,9 +649,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false) -namespace llvm { - -FunctionPass *createLoopLoadEliminationPass() { +FunctionPass *llvm::createLoopLoadEliminationPass() { return new LoopLoadElimination(); } @@ -652,7 +666,8 @@ PreservedAnalyses LoopLoadEliminationPass::run(Function &F, auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); bool Changed = eliminateLoadsAcrossLoops( F, LI, DT, [&](Loop &L) -> const LoopAccessInfo & { - LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI}; + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, + SE, TLI, TTI, nullptr}; return LAM.getResult<LoopAccessAnalysis>(L, AR); }); @@ -662,5 +677,3 @@ PreservedAnalyses LoopLoadEliminationPass::run(Function &F, PreservedAnalyses PA; return PA; } - -} // end namespace llvm diff --git a/lib/Transforms/Scalar/LoopPredication.cpp b/lib/Transforms/Scalar/LoopPredication.cpp index 9b12ba180444..2e4c7b19e476 100644 --- a/lib/Transforms/Scalar/LoopPredication.cpp +++ b/lib/Transforms/Scalar/LoopPredication.cpp @@ -34,6 +34,143 @@ // else // deoptimize // +// It's tempting to rely on SCEV here, but it has proven to be problematic. +// Generally the facts SCEV provides about the increment step of add +// recurrences are true if the backedge of the loop is taken, which implicitly +// assumes that the guard doesn't fail. Using these facts to optimize the +// guard results in a circular logic where the guard is optimized under the +// assumption that it never fails. +// +// For example, in the loop below the induction variable will be marked as nuw +// basing on the guard. Basing on nuw the guard predicate will be considered +// monotonic. Given a monotonic condition it's tempting to replace the induction +// variable in the condition with its value on the last iteration. But this +// transformation is not correct, e.g. e = 4, b = 5 breaks the loop. +// +// for (int i = b; i != e; i++) +// guard(i u< len) +// +// One of the ways to reason about this problem is to use an inductive proof +// approach. 
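A quick way to see the trap described above is to run the counterexample: with `b = 5, e = 4` the `i != e` exit condition lets the induction variable wrap, so checking the guard only at the apparent last-iteration value wrongly reports the loop safe. A standalone sketch (plain C++, not part of the patch; `len` is chosen arbitrarily, and plain `<` on `uint32_t` stands in for `u<`):

```cpp
#include <cstdint>
#include <cstdio>

// Models guard(i u< len) inside: for (i = b; i != e; i++).
// With b = 5, e = 4 the IV walks 5, 6, ..., UINT32_MAX, 0, ..., 3 and
// visits values far above len, so the guard must fail.
int main() {
  const uint32_t b = 5, e = 4, len = 100;

  // Faithful execution: does any iteration violate the guard?
  bool guardFails = false;
  for (uint32_t i = b; i != e; ++i)
    if (!(i < len)) { guardFails = true; break; }

  // The incorrect "monotonic" rewrite: test only the value on the
  // (apparent) last iteration, e - 1.
  bool naiveWidenedCheck = (e - 1) < len;

  std::printf("guard actually fails: %s\n", guardFails ? "yes" : "no");
  std::printf("naive widened check passes: %s\n",
              naiveWidenedCheck ? "yes" : "no"); // wrongly says "safe"
}
```

The inductive formulation that rules this out follows.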
Given the loop: +// +// if (B(0)) { +// do { +// I = PHI(0, I.INC) +// I.INC = I + Step +// guard(G(I)); +// } while (B(I)); +// } +// +// where B(x) and G(x) are predicates that map integers to booleans, we want a +// loop invariant expression M such the following program has the same semantics +// as the above: +// +// if (B(0)) { +// do { +// I = PHI(0, I.INC) +// I.INC = I + Step +// guard(G(0) && M); +// } while (B(I)); +// } +// +// One solution for M is M = forall X . (G(X) && B(X)) => G(X + Step) +// +// Informal proof that the transformation above is correct: +// +// By the definition of guards we can rewrite the guard condition to: +// G(I) && G(0) && M +// +// Let's prove that for each iteration of the loop: +// G(0) && M => G(I) +// And the condition above can be simplified to G(Start) && M. +// +// Induction base. +// G(0) && M => G(0) +// +// Induction step. Assuming G(0) && M => G(I) on the subsequent +// iteration: +// +// B(I) is true because it's the backedge condition. +// G(I) is true because the backedge is guarded by this condition. +// +// So M = forall X . (G(X) && B(X)) => G(X + Step) implies G(I + Step). +// +// Note that we can use anything stronger than M, i.e. any condition which +// implies M. +// +// When S = 1 (i.e. forward iterating loop), the transformation is supported +// when: +// * The loop has a single latch with the condition of the form: +// B(X) = latchStart + X <pred> latchLimit, +// where <pred> is u<, u<=, s<, or s<=. +// * The guard condition is of the form +// G(X) = guardStart + X u< guardLimit +// +// For the ult latch comparison case M is: +// forall X . guardStart + X u< guardLimit && latchStart + X <u latchLimit => +// guardStart + X + 1 u< guardLimit +// +// The only way the antecedent can be true and the consequent can be false is +// if +// X == guardLimit - 1 - guardStart +// (and guardLimit is non-zero, but we won't use this latter fact). +// If X == guardLimit - 1 - guardStart then the second half of the antecedent is +// latchStart + guardLimit - 1 - guardStart u< latchLimit +// and its negation is +// latchStart + guardLimit - 1 - guardStart u>= latchLimit +// +// In other words, if +// latchLimit u<= latchStart + guardLimit - 1 - guardStart +// then: +// (the ranges below are written in ConstantRange notation, where [A, B) is the +// set for (I = A; I != B; I++ /*maywrap*/) yield(I);) +// +// forall X . guardStart + X u< guardLimit && +// latchStart + X u< latchLimit => +// guardStart + X + 1 u< guardLimit +// == forall X . guardStart + X u< guardLimit && +// latchStart + X u< latchStart + guardLimit - 1 - guardStart => +// guardStart + X + 1 u< guardLimit +// == forall X . (guardStart + X) in [0, guardLimit) && +// (latchStart + X) in [0, latchStart + guardLimit - 1 - guardStart) => +// (guardStart + X + 1) in [0, guardLimit) +// == forall X . 
X in [-guardStart, guardLimit - guardStart) && +// X in [-latchStart, guardLimit - 1 - guardStart) => +// X in [-guardStart - 1, guardLimit - guardStart - 1) +// == true +// +// So the widened condition is: +// guardStart u< guardLimit && +// latchStart + guardLimit - 1 - guardStart u>= latchLimit +// Similarly for ule condition the widened condition is: +// guardStart u< guardLimit && +// latchStart + guardLimit - 1 - guardStart u> latchLimit +// For slt condition the widened condition is: +// guardStart u< guardLimit && +// latchStart + guardLimit - 1 - guardStart s>= latchLimit +// For sle condition the widened condition is: +// guardStart u< guardLimit && +// latchStart + guardLimit - 1 - guardStart s> latchLimit +// +// When S = -1 (i.e. reverse iterating loop), the transformation is supported +// when: +// * The loop has a single latch with the condition of the form: +// B(X) = X <pred> latchLimit, where <pred> is u> or s>. +// * The guard condition is of the form +// G(X) = X - 1 u< guardLimit +// +// For the ugt latch comparison case M is: +// forall X. X-1 u< guardLimit and X u> latchLimit => X-2 u< guardLimit +// +// The only way the antecedent can be true and the consequent can be false is if +// X == 1. +// If X == 1 then the second half of the antecedent is +// 1 u> latchLimit, and its negation is latchLimit u>= 1. +// +// So the widened condition is: +// guardStart u< guardLimit && latchLimit u>= 1. +// Similarly for sgt condition the widened condition is: +// guardStart u< guardLimit && latchLimit s>= 1. //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopPredication.h" @@ -56,6 +193,11 @@ using namespace llvm; +static cl::opt<bool> EnableIVTruncation("loop-predication-enable-iv-truncation", + cl::Hidden, cl::init(true)); + +static cl::opt<bool> EnableCountDownLoop("loop-predication-enable-count-down-loop", + cl::Hidden, cl::init(true)); namespace { class LoopPredication { /// Represents an induction variable check: @@ -68,6 +210,10 @@ class LoopPredication { const SCEV *Limit) : Pred(Pred), IV(IV), Limit(Limit) {} LoopICmp() {} + void dump() { + dbgs() << "LoopICmp Pred = " << Pred << ", IV = " << *IV + << ", Limit = " << *Limit << "\n"; + } }; ScalarEvolution *SE; @@ -75,17 +221,51 @@ class LoopPredication { Loop *L; const DataLayout *DL; BasicBlock *Preheader; + LoopICmp LatchCheck; - Optional<LoopICmp> parseLoopICmp(ICmpInst *ICI); + bool isSupportedStep(const SCEV* Step); + Optional<LoopICmp> parseLoopICmp(ICmpInst *ICI) { + return parseLoopICmp(ICI->getPredicate(), ICI->getOperand(0), + ICI->getOperand(1)); + } + Optional<LoopICmp> parseLoopICmp(ICmpInst::Predicate Pred, Value *LHS, + Value *RHS); + + Optional<LoopICmp> parseLoopLatchICmp(); + bool CanExpand(const SCEV* S); Value *expandCheck(SCEVExpander &Expander, IRBuilder<> &Builder, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, Instruction *InsertAt); Optional<Value *> widenICmpRangeCheck(ICmpInst *ICI, SCEVExpander &Expander, IRBuilder<> &Builder); + Optional<Value *> widenICmpRangeCheckIncrementingLoop(LoopICmp LatchCheck, + LoopICmp RangeCheck, + SCEVExpander &Expander, + IRBuilder<> &Builder); + Optional<Value *> widenICmpRangeCheckDecrementingLoop(LoopICmp LatchCheck, + LoopICmp RangeCheck, + SCEVExpander &Expander, + IRBuilder<> &Builder); bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander); + // When the IV type is wider than the range operand type, we can still do loop + // predication, by generating SCEVs for 
the range and latch that are of the + // same type. We achieve this by generating a SCEV truncate expression for the + // latch IV. This is done iff truncation of the IV is a safe operation, + // without loss of information. + // Another way to achieve this is by generating a wider type SCEV for the + // range check operand, however, this needs a more involved check that + // operands do not overflow. This can lead to loss of information when the + // range operand is of the form: add i32 %offset, %iv. We need to prove that + // sext(x + y) is same as sext(x) + sext(y). + // This function returns true if we can safely represent the IV type in + // the RangeCheckType without loss of information. + bool isSafeToTruncateWideIVType(Type *RangeCheckType); + // Return the loopLatchCheck corresponding to the RangeCheckType if safe to do + // so. + Optional<LoopICmp> generateLoopLatchCheck(Type *RangeCheckType); public: LoopPredication(ScalarEvolution *SE) : SE(SE){}; bool runOnLoop(Loop *L); @@ -135,11 +315,8 @@ PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM, } Optional<LoopPredication::LoopICmp> -LoopPredication::parseLoopICmp(ICmpInst *ICI) { - ICmpInst::Predicate Pred = ICI->getPredicate(); - - Value *LHS = ICI->getOperand(0); - Value *RHS = ICI->getOperand(1); +LoopPredication::parseLoopICmp(ICmpInst::Predicate Pred, Value *LHS, + Value *RHS) { const SCEV *LHSS = SE->getSCEV(LHS); if (isa<SCEVCouldNotCompute>(LHSS)) return None; @@ -165,13 +342,146 @@ Value *LoopPredication::expandCheck(SCEVExpander &Expander, IRBuilder<> &Builder, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, Instruction *InsertAt) { + // TODO: we can check isLoopEntryGuardedByCond before emitting the check + Type *Ty = LHS->getType(); assert(Ty == RHS->getType() && "expandCheck operands have different types?"); + + if (SE->isLoopEntryGuardedByCond(L, Pred, LHS, RHS)) + return Builder.getTrue(); + Value *LHSV = Expander.expandCodeFor(LHS, Ty, InsertAt); Value *RHSV = Expander.expandCodeFor(RHS, Ty, InsertAt); return Builder.CreateICmp(Pred, LHSV, RHSV); } +Optional<LoopPredication::LoopICmp> +LoopPredication::generateLoopLatchCheck(Type *RangeCheckType) { + + auto *LatchType = LatchCheck.IV->getType(); + if (RangeCheckType == LatchType) + return LatchCheck; + // For now, bail out if latch type is narrower than range type. + if (DL->getTypeSizeInBits(LatchType) < DL->getTypeSizeInBits(RangeCheckType)) + return None; + if (!isSafeToTruncateWideIVType(RangeCheckType)) + return None; + // We can now safely identify the truncated version of the IV and limit for + // RangeCheckType. 
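An aside on the comment above: the reason widening the range-check operand would need the stronger overflow proof is that sign extension does not distribute over a wrapping add. A minimal demonstration (plain C++, not part of the patch; `sext32` and `add32` are illustrative helpers):

```cpp
#include <cstdint>
#include <cstdio>

// sext of a 32-bit value to 64 bits.
static int64_t sext32(int32_t v) { return static_cast<int64_t>(v); }

// 32-bit wrapping add (made well-defined via unsigned arithmetic).
static int32_t add32(int32_t a, int32_t b) {
  return static_cast<int32_t>(static_cast<uint32_t>(a) +
                              static_cast<uint32_t>(b));
}

int main() {
  int32_t x = INT32_MAX, y = 1; // the narrow add wraps
  std::printf("sext(x + y)       = %lld\n",
              static_cast<long long>(sext32(add32(x, y)))); // -2147483648
  std::printf("sext(x) + sext(y) = %lld\n",
              static_cast<long long>(sext32(x) + sext32(y))); // 2147483648
}
```

Truncating the latch IV instead, as the function body resumed below does, sidesteps this proof obligation entirely.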
+ LoopICmp NewLatchCheck; + NewLatchCheck.Pred = LatchCheck.Pred; + NewLatchCheck.IV = dyn_cast<SCEVAddRecExpr>( + SE->getTruncateExpr(LatchCheck.IV, RangeCheckType)); + if (!NewLatchCheck.IV) + return None; + NewLatchCheck.Limit = SE->getTruncateExpr(LatchCheck.Limit, RangeCheckType); + DEBUG(dbgs() << "IV of type: " << *LatchType + << "can be represented as range check type:" << *RangeCheckType + << "\n"); + DEBUG(dbgs() << "LatchCheck.IV: " << *NewLatchCheck.IV << "\n"); + DEBUG(dbgs() << "LatchCheck.Limit: " << *NewLatchCheck.Limit << "\n"); + return NewLatchCheck; +} + +bool LoopPredication::isSupportedStep(const SCEV* Step) { + return Step->isOne() || (Step->isAllOnesValue() && EnableCountDownLoop); +} + +bool LoopPredication::CanExpand(const SCEV* S) { + return SE->isLoopInvariant(S, L) && isSafeToExpand(S, *SE); +} + +Optional<Value *> LoopPredication::widenICmpRangeCheckIncrementingLoop( + LoopPredication::LoopICmp LatchCheck, LoopPredication::LoopICmp RangeCheck, + SCEVExpander &Expander, IRBuilder<> &Builder) { + auto *Ty = RangeCheck.IV->getType(); + // Generate the widened condition for the forward loop: + // guardStart u< guardLimit && + // latchLimit <pred> guardLimit - 1 - guardStart + latchStart + // where <pred> depends on the latch condition predicate. See the file + // header comment for the reasoning. + // guardLimit - guardStart + latchStart - 1 + const SCEV *GuardStart = RangeCheck.IV->getStart(); + const SCEV *GuardLimit = RangeCheck.Limit; + const SCEV *LatchStart = LatchCheck.IV->getStart(); + const SCEV *LatchLimit = LatchCheck.Limit; + + // guardLimit - guardStart + latchStart - 1 + const SCEV *RHS = + SE->getAddExpr(SE->getMinusSCEV(GuardLimit, GuardStart), + SE->getMinusSCEV(LatchStart, SE->getOne(Ty))); + if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) || + !CanExpand(LatchLimit) || !CanExpand(RHS)) { + DEBUG(dbgs() << "Can't expand limit check!\n"); + return None; + } + ICmpInst::Predicate LimitCheckPred; + switch (LatchCheck.Pred) { + case ICmpInst::ICMP_ULT: + LimitCheckPred = ICmpInst::ICMP_ULE; + break; + case ICmpInst::ICMP_ULE: + LimitCheckPred = ICmpInst::ICMP_ULT; + break; + case ICmpInst::ICMP_SLT: + LimitCheckPred = ICmpInst::ICMP_SLE; + break; + case ICmpInst::ICMP_SLE: + LimitCheckPred = ICmpInst::ICMP_SLT; + break; + default: + llvm_unreachable("Unsupported loop latch!"); + } + + DEBUG(dbgs() << "LHS: " << *LatchLimit << "\n"); + DEBUG(dbgs() << "RHS: " << *RHS << "\n"); + DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n"); + + Instruction *InsertAt = Preheader->getTerminator(); + auto *LimitCheck = + expandCheck(Expander, Builder, LimitCheckPred, LatchLimit, RHS, InsertAt); + auto *FirstIterationCheck = expandCheck(Expander, Builder, RangeCheck.Pred, + GuardStart, GuardLimit, InsertAt); + return Builder.CreateAnd(FirstIterationCheck, LimitCheck); +} + +Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop( + LoopPredication::LoopICmp LatchCheck, LoopPredication::LoopICmp RangeCheck, + SCEVExpander &Expander, IRBuilder<> &Builder) { + auto *Ty = RangeCheck.IV->getType(); + const SCEV *GuardStart = RangeCheck.IV->getStart(); + const SCEV *GuardLimit = RangeCheck.Limit; + const SCEV *LatchLimit = LatchCheck.Limit; + if (!CanExpand(GuardStart) || !CanExpand(GuardLimit) || + !CanExpand(LatchLimit)) { + DEBUG(dbgs() << "Can't expand limit check!\n"); + return None; + } + // The decrement of the latch check IV should be the same as the + // rangeCheckIV. 
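The predicate table derived in the file header comment can also be spot-checked mechanically. A throwaway verifier (plain C++, not part of the patch) runs the ult case exhaustively over a toy 4-bit unsigned domain standing in for iN wraparound arithmetic and finds no counterexamples:

```cpp
#include <cstdio>

// Exhaustive check of the "ult latch" widening from the file header,
// over 4-bit unsigned arithmetic (wraparound mod 16) instead of iN.
// Claim: if   latchLimit u<= latchStart + guardLimit - 1 - guardStart
// then forall X:
//   guardStart+X u< guardLimit && latchStart+X u< latchLimit
//     => guardStart+X+1 u< guardLimit
int main() {
  const unsigned M = 16; // toy word size: 4 bits
  auto add = [&](unsigned a, unsigned b) { return (a + b) % M; };
  auto sub = [&](unsigned a, unsigned b) { return (a + M - b) % M; };

  unsigned violations = 0;
  for (unsigned gs = 0; gs < M; ++gs)
    for (unsigned gl = 0; gl < M; ++gl)
      for (unsigned ls = 0; ls < M; ++ls)
        for (unsigned ll = 0; ll < M; ++ll) {
          // The widened limit check:
          //   latchLimit u<= latchStart + guardLimit - 1 - guardStart
          if (!(ll <= add(ls, sub(sub(gl, 1), gs))))
            continue;
          for (unsigned X = 0; X < M; ++X) {
            bool antecedent = add(gs, X) < gl && add(ls, X) < ll;
            bool consequent = add(add(gs, X), 1) < gl;
            if (antecedent && !consequent)
              ++violations;
          }
        }
  std::printf("violations: %u\n", violations); // prints 0
}
```

The same harness can be adapted to the ule/slt/sle rows by swapping the comparison in the filter.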
+ auto *PostDecLatchCheckIV = LatchCheck.IV->getPostIncExpr(*SE); + if (RangeCheck.IV != PostDecLatchCheckIV) { + DEBUG(dbgs() << "Not the same. PostDecLatchCheckIV: " + << *PostDecLatchCheckIV + << " and RangeCheckIV: " << *RangeCheck.IV << "\n"); + return None; + } + + // Generate the widened condition for CountDownLoop: + // guardStart u< guardLimit && + // latchLimit <pred> 1. + // See the header comment for reasoning of the checks. + Instruction *InsertAt = Preheader->getTerminator(); + auto LimitCheckPred = ICmpInst::isSigned(LatchCheck.Pred) + ? ICmpInst::ICMP_SGE + : ICmpInst::ICMP_UGE; + auto *FirstIterationCheck = expandCheck(Expander, Builder, ICmpInst::ICMP_ULT, + GuardStart, GuardLimit, InsertAt); + auto *LimitCheck = expandCheck(Expander, Builder, LimitCheckPred, LatchLimit, + SE->getOne(Ty), InsertAt); + return Builder.CreateAnd(FirstIterationCheck, LimitCheck); +} + /// If ICI can be widened to a loop invariant condition emits the loop /// invariant condition in the loop preheader and return it, otherwise /// returns None. @@ -181,51 +491,62 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI, DEBUG(dbgs() << "Analyzing ICmpInst condition:\n"); DEBUG(ICI->dump()); + // parseLoopStructure guarantees that the latch condition is: + // ++i <pred> latchLimit, where <pred> is u<, u<=, s<, or s<=. + // We are looking for the range checks of the form: + // i u< guardLimit auto RangeCheck = parseLoopICmp(ICI); if (!RangeCheck) { DEBUG(dbgs() << "Failed to parse the loop latch condition!\n"); return None; } - - ICmpInst::Predicate Pred = RangeCheck->Pred; - const SCEVAddRecExpr *IndexAR = RangeCheck->IV; - const SCEV *RHSS = RangeCheck->Limit; - - auto CanExpand = [this](const SCEV *S) { - return SE->isLoopInvariant(S, L) && isSafeToExpand(S, *SE); - }; - if (!CanExpand(RHSS)) + DEBUG(dbgs() << "Guard check:\n"); + DEBUG(RangeCheck->dump()); + if (RangeCheck->Pred != ICmpInst::ICMP_ULT) { + DEBUG(dbgs() << "Unsupported range check predicate(" << RangeCheck->Pred + << ")!\n"); return None; - - DEBUG(dbgs() << "IndexAR: "); - DEBUG(IndexAR->dump()); - - bool IsIncreasing = false; - if (!SE->isMonotonicPredicate(IndexAR, Pred, IsIncreasing)) + } + auto *RangeCheckIV = RangeCheck->IV; + if (!RangeCheckIV->isAffine()) { + DEBUG(dbgs() << "Range check IV is not affine!\n"); return None; - - // If the predicate is increasing the condition can change from false to true - // as the loop progresses, in this case take the value on the first iteration - // for the widened check. Otherwise the condition can change from true to - // false as the loop progresses, so take the value on the last iteration. - const SCEV *NewLHSS = IsIncreasing - ? IndexAR->getStart() - : SE->getSCEVAtScope(IndexAR, L->getParentLoop()); - if (NewLHSS == IndexAR) { - DEBUG(dbgs() << "Can't compute NewLHSS!\n"); + } + auto *Step = RangeCheckIV->getStepRecurrence(*SE); + // We cannot just compare with latch IV step because the latch and range IVs + // may have different types. + if (!isSupportedStep(Step)) { + DEBUG(dbgs() << "Range check and latch have IVs different steps!\n"); return None; } - - DEBUG(dbgs() << "NewLHSS: "); - DEBUG(NewLHSS->dump()); - - if (!CanExpand(NewLHSS)) + auto *Ty = RangeCheckIV->getType(); + auto CurrLatchCheckOpt = generateLoopLatchCheck(Ty); + if (!CurrLatchCheckOpt) { + DEBUG(dbgs() << "Failed to generate a loop latch check " + "corresponding to range type: " + << *Ty << "\n"); return None; + } - DEBUG(dbgs() << "NewLHSS is loop invariant and safe to expand. 
Expand!\n"); + LoopICmp CurrLatchCheck = *CurrLatchCheckOpt; + // At this point, the range and latch step should have the same type, but need + // not have the same value (we support both 1 and -1 steps). + assert(Step->getType() == + CurrLatchCheck.IV->getStepRecurrence(*SE)->getType() && + "Range and latch steps should be of same type!"); + if (Step != CurrLatchCheck.IV->getStepRecurrence(*SE)) { + DEBUG(dbgs() << "Range and latch have different step values!\n"); + return None; + } - Instruction *InsertAt = Preheader->getTerminator(); - return expandCheck(Expander, Builder, Pred, NewLHSS, RHSS, InsertAt); + if (Step->isOne()) + return widenICmpRangeCheckIncrementingLoop(CurrLatchCheck, *RangeCheck, + Expander, Builder); + else { + assert(Step->isAllOnesValue() && "Step should be -1!"); + return widenICmpRangeCheckDecrementingLoop(CurrLatchCheck, *RangeCheck, + Expander, Builder); + } } bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard, @@ -288,6 +609,97 @@ bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard, return true; } +Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() { + using namespace PatternMatch; + + BasicBlock *LoopLatch = L->getLoopLatch(); + if (!LoopLatch) { + DEBUG(dbgs() << "The loop doesn't have a single latch!\n"); + return None; + } + + ICmpInst::Predicate Pred; + Value *LHS, *RHS; + BasicBlock *TrueDest, *FalseDest; + + if (!match(LoopLatch->getTerminator(), + m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)), TrueDest, + FalseDest))) { + DEBUG(dbgs() << "Failed to match the latch terminator!\n"); + return None; + } + assert((TrueDest == L->getHeader() || FalseDest == L->getHeader()) && + "One of the latch's destinations must be the header"); + if (TrueDest != L->getHeader()) + Pred = ICmpInst::getInversePredicate(Pred); + + auto Result = parseLoopICmp(Pred, LHS, RHS); + if (!Result) { + DEBUG(dbgs() << "Failed to parse the loop latch condition!\n"); + return None; + } + + // Check affine first, so if it's not we don't try to compute the step + // recurrence. + if (!Result->IV->isAffine()) { + DEBUG(dbgs() << "The induction variable is not affine!\n"); + return None; + } + + auto *Step = Result->IV->getStepRecurrence(*SE); + if (!isSupportedStep(Step)) { + DEBUG(dbgs() << "Unsupported loop stride(" << *Step << ")!\n"); + return None; + } + + auto IsUnsupportedPredicate = [](const SCEV *Step, ICmpInst::Predicate Pred) { + if (Step->isOne()) { + return Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_SLT && + Pred != ICmpInst::ICMP_ULE && Pred != ICmpInst::ICMP_SLE; + } else { + assert(Step->isAllOnesValue() && "Step should be -1!"); + return Pred != ICmpInst::ICMP_UGT && Pred != ICmpInst::ICMP_SGT; + } + }; + + if (IsUnsupportedPredicate(Step, Result->Pred)) { + DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred + << ")!\n"); + return None; + } + return Result; +} + +// Returns true if its safe to truncate the IV to RangeCheckType. +bool LoopPredication::isSafeToTruncateWideIVType(Type *RangeCheckType) { + if (!EnableIVTruncation) + return false; + assert(DL->getTypeSizeInBits(LatchCheck.IV->getType()) > + DL->getTypeSizeInBits(RangeCheckType) && + "Expected latch check IV type to be larger than range check operand " + "type!"); + // The start and end values of the IV should be known. This is to guarantee + // that truncating the wide type will not lose information. 
+ auto *Limit = dyn_cast<SCEVConstant>(LatchCheck.Limit); + auto *Start = dyn_cast<SCEVConstant>(LatchCheck.IV->getStart()); + if (!Limit || !Start) + return false; + // This check makes sure that the IV does not change sign during loop + // iterations. Consider latchType = i64, LatchStart = 5, Pred = ICMP_SGE, + // LatchEnd = 2, rangeCheckType = i32. If it's not a monotonic predicate, the + // IV wraps around, and the truncation of the IV would lose the range of + // iterations between 2^32 and 2^64. + bool Increasing; + if (!SE->isMonotonicPredicate(LatchCheck.IV, LatchCheck.Pred, Increasing)) + return false; + // The active bits should be less than the bits in the RangeCheckType. This + // guarantees that truncating the latch check to RangeCheckType is a safe + // operation. + auto RangeCheckTypeBitSize = DL->getTypeSizeInBits(RangeCheckType); + return Start->getAPInt().getActiveBits() < RangeCheckTypeBitSize && + Limit->getAPInt().getActiveBits() < RangeCheckTypeBitSize; +} + bool LoopPredication::runOnLoop(Loop *Loop) { L = Loop; @@ -308,6 +720,14 @@ bool LoopPredication::runOnLoop(Loop *Loop) { if (!Preheader) return false; + auto LatchCheckOpt = parseLoopLatchICmp(); + if (!LatchCheckOpt) + return false; + LatchCheck = *LatchCheckOpt; + + DEBUG(dbgs() << "Latch check:\n"); + DEBUG(LatchCheck.dump()); + // Collect all the guards into a vector and process later, so as not // to invalidate the instruction iterator. SmallVector<IntrinsicInst *, 4> Guards; diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp index fc0216e76a5b..d1a54b877950 100644 --- a/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -1,4 +1,4 @@ -//===-- LoopReroll.cpp - Loop rerolling pass ------------------------------===// +//===- LoopReroll.cpp - Loop rerolling pass -------------------------------===// // // The LLVM Compiler Infrastructure // @@ -11,22 +11,42 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/APInt.h" #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -34,6 +54,13 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include 
<cassert> +#include <cstddef> +#include <cstdint> +#include <cstdlib> +#include <iterator> +#include <map> +#include <utility> using namespace llvm; @@ -127,6 +154,7 @@ NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400), // br %cmp, header, exit namespace { + enum IterationLimits { /// The maximum number of iterations that we'll try and reroll. IL_MaxRerollIterations = 32, @@ -139,6 +167,7 @@ namespace { class LoopReroll : public LoopPass { public: static char ID; // Pass ID, replacement for typeid + LoopReroll() : LoopPass(ID) { initializeLoopRerollPass(*PassRegistry::getPassRegistry()); } @@ -158,11 +187,12 @@ namespace { DominatorTree *DT; bool PreserveLCSSA; - typedef SmallVector<Instruction *, 16> SmallInstructionVector; - typedef SmallSet<Instruction *, 16> SmallInstructionSet; + using SmallInstructionVector = SmallVector<Instruction *, 16>; + using SmallInstructionSet = SmallSet<Instruction *, 16>; // Map between induction variable and its increment DenseMap<Instruction *, int64_t> IVToIncMap; + // For loop with multiple induction variable, remember the one used only to // control the loop. Instruction *LoopControlIV; @@ -171,8 +201,7 @@ namespace { // representing a reduction. Only the last value may be used outside the // loop. struct SimpleLoopReduction { - SimpleLoopReduction(Instruction *P, Loop *L) - : Valid(false), Instructions(1, P) { + SimpleLoopReduction(Instruction *P, Loop *L) : Instructions(1, P) { assert(isa<PHINode>(P) && "First reduction instruction must be a PHI"); add(L); } @@ -204,8 +233,8 @@ namespace { return Instructions.size()-1; } - typedef SmallInstructionVector::iterator iterator; - typedef SmallInstructionVector::const_iterator const_iterator; + using iterator = SmallInstructionVector::iterator; + using const_iterator = SmallInstructionVector::const_iterator; iterator begin() { assert(Valid && "Using invalid reduction"); @@ -221,7 +250,7 @@ namespace { const_iterator end() const { return Instructions.end(); } protected: - bool Valid; + bool Valid = false; SmallInstructionVector Instructions; void add(Loop *L); @@ -230,7 +259,7 @@ namespace { // The set of all reductions, and state tracking of possible reductions // during loop instruction processing. struct ReductionTracker { - typedef SmallVector<SimpleLoopReduction, 16> SmallReductionVector; + using SmallReductionVector = SmallVector<SimpleLoopReduction, 16>; // Add a new possible reduction. void addSLR(SimpleLoopReduction &SLR) { PossibleReds.push_back(SLR); } @@ -342,6 +371,7 @@ namespace { struct DAGRootSet { Instruction *BaseInst; SmallInstructionVector Roots; + // The instructions between IV and BaseInst (but not including BaseInst). SmallInstructionSet SubsumedInsts; }; @@ -361,15 +391,17 @@ namespace { /// Stage 1: Find all the DAG roots for the induction variable. bool findRoots(); + /// Stage 2: Validate if the found roots are valid. bool validate(ReductionTracker &Reductions); + /// Stage 3: Assuming validate() returned true, perform the /// replacement. /// @param IterCount The maximum iteration count of L. void replace(const SCEV *IterCount); protected: - typedef MapVector<Instruction*, BitVector> UsesTy; + using UsesTy = MapVector<Instruction *, BitVector>; void findRootsRecursive(Instruction *IVU, SmallInstructionSet SubsumedInsts); @@ -412,22 +444,29 @@ namespace { // The loop induction variable. Instruction *IV; + // Loop step amount. 
int64_t Inc; + // Loop reroll count; if Inc == 1, this records the scaling applied // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ; // If Inc is not 1, Scale = Inc. uint64_t Scale; + // The roots themselves. SmallVector<DAGRootSet,16> RootSets; + // All increment instructions for IV. SmallInstructionVector LoopIncs; + // Map of all instructions in the loop (in order) to the iterations // they are used in (or specially, IL_All for instructions // used in the loop increment mechanism). UsesTy Uses; + // Map between induction variable and its increment DenseMap<Instruction *, int64_t> &IVToIncMap; + Instruction *LoopControlIV; }; @@ -446,9 +485,11 @@ namespace { bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount, ReductionTracker &Reductions); }; -} + +} // end anonymous namespace char LoopReroll::ID = 0; + INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false) INITIALIZE_PASS_DEPENDENCY(LoopPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) @@ -1069,7 +1110,6 @@ bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &Po } return true; - } /// Get the next instruction in "In" that is a member of set Val. @@ -1124,7 +1164,7 @@ static bool isIgnorableInst(const Instruction *I) { switch (II->getIntrinsicID()) { default: return false; - case llvm::Intrinsic::annotation: + case Intrinsic::annotation: case Intrinsic::ptr_annotation: case Intrinsic::var_annotation: // TODO: the following intrinsics may also be whitelisted: @@ -1407,8 +1447,8 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { BaseIt = nextInstr(0, Uses, Visited); RootIt = nextInstr(Iter, Uses, Visited); } - assert (BaseIt == Uses.end() && RootIt == Uses.end() && - "Mismatched set sizes!"); + assert(BaseIt == Uses.end() && RootIt == Uses.end() && + "Mismatched set sizes!"); } DEBUG(dbgs() << "LRR: Matched all iteration increments for " << diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index 3506ac343d59..a91f53ba663f 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -25,6 +25,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" @@ -141,37 +142,29 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug // intrinsics. - LLVMContext &C = OrigHeader->getContext(); - if (auto *VAM = ValueAsMetadata::getIfExists(OrigHeaderVal)) { - if (auto *MAV = MetadataAsValue::getIfExists(C, VAM)) { - for (auto UI = MAV->use_begin(), E = MAV->use_end(); UI != E;) { - // Grab the use before incrementing the iterator. Otherwise, altering - // the Use will invalidate the iterator. - Use &U = *UI++; - DbgInfoIntrinsic *UserInst = dyn_cast<DbgInfoIntrinsic>(U.getUser()); - if (!UserInst) - continue; - - // The original users in the OrigHeader are already using the original - // definitions. - BasicBlock *UserBB = UserInst->getParent(); - if (UserBB == OrigHeader) - continue; + SmallVector<DbgValueInst *, 1> DbgValues; + llvm::findDbgValues(DbgValues, OrigHeaderVal); + for (auto &DbgValue : DbgValues) { + // The original users in the OrigHeader are already using the original + // definitions. 
+ BasicBlock *UserBB = DbgValue->getParent(); + if (UserBB == OrigHeader) + continue; - // Users in the OrigPreHeader need to use the value to which the - // original definitions are mapped and anything else can be handled by - // the SSAUpdater. To avoid adding PHINodes, check if the value is - // available in UserBB, if not substitute undef. - Value *NewVal; - if (UserBB == OrigPreheader) - NewVal = OrigPreHeaderVal; - else if (SSA.HasValueForBlock(UserBB)) - NewVal = SSA.GetValueInMiddleOfBlock(UserBB); - else - NewVal = UndefValue::get(OrigHeaderVal->getType()); - U = MetadataAsValue::get(C, ValueAsMetadata::get(NewVal)); - } - } + // Users in the OrigPreHeader need to use the value to which the + // original definitions are mapped and anything else can be handled by + // the SSAUpdater. To avoid adding PHINodes, check if the value is + // available in UserBB, if not substitute undef. + Value *NewVal; + if (UserBB == OrigPreheader) + NewVal = OrigPreHeaderVal; + else if (SSA.HasValueForBlock(UserBB)) + NewVal = SSA.GetValueInMiddleOfBlock(UserBB); + else + NewVal = UndefValue::get(OrigHeaderVal->getType()); + DbgValue->setOperand(0, + MetadataAsValue::get(OrigHeaderVal->getContext(), + ValueAsMetadata::get(NewVal))); } } } @@ -315,6 +308,22 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // For the rest of the instructions, either hoist to the OrigPreheader if // possible or create a clone in the OldPreHeader if not. TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator(); + + // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication. + using DbgIntrinsicHash = + std::pair<std::pair<Value *, DILocalVariable *>, DIExpression *>; + auto makeHash = [](DbgInfoIntrinsic *D) -> DbgIntrinsicHash { + return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()}; + }; + SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics; + for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend(); + I != E; ++I) { + if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&*I)) + DbgIntrinsics.insert(makeHash(DII)); + else + break; + } + while (I != E) { Instruction *Inst = &*I++; @@ -338,6 +347,13 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { RemapInstruction(C, ValueMap, RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + // Avoid inserting the same intrinsic twice. + if (auto *DII = dyn_cast<DbgInfoIntrinsic>(C)) + if (DbgIntrinsics.count(makeHash(DII))) { + C->deleteValue(); + continue; + } + // With the operands remapped, see if the instruction constant folds or is // otherwise simplifyable. This commonly occurs because the entry from PHI // nodes allows icmps and other instructions to fold. @@ -395,6 +411,17 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { L->moveToHeader(NewHeader); assert(L->getHeader() == NewHeader && "Latch block is our new header"); + // Inform DT about changes to the CFG. + if (DT) { + // The OrigPreheader branches to the NewHeader and Exit now. Then, inform + // the DT about the removed edge to the OrigHeader (that got removed). + SmallVector<DominatorTree::UpdateType, 3> Updates; + Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit}); + Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader}); + Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader}); + DT->applyUpdates(Updates); + } + // At this point, we've finished our major CFG changes. 
As part of cloning // the loop into the preheader we've simplified instructions and the // duplicated conditional branch may now be branching on a constant. If it is @@ -408,26 +435,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) != NewHeader) { // The conditional branch can't be folded, handle the general case. - // Update DominatorTree to reflect the CFG change we just made. Then split - // edges as necessary to preserve LoopSimplify form. - if (DT) { - // Everything that was dominated by the old loop header is now dominated - // by the original loop preheader. Conceptually the header was merged - // into the preheader, even though we reuse the actual block as a new - // loop latch. - DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader); - SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(), - OrigHeaderNode->end()); - DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader); - for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) - DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode); - - assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode); - assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode); - - // Update OrigHeader to be dominated by the new header block. - DT->changeImmediateDominator(OrigHeader, OrigLatch); - } + // Split edges as necessary to preserve LoopSimplify form. // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and // thus is not a preheader anymore. @@ -467,52 +475,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { PHBI->eraseFromParent(); // With our CFG finalized, update DomTree if it is available. - if (DT) { - // Update OrigHeader to be dominated by the new header block. - DT->changeImmediateDominator(NewHeader, OrigPreheader); - DT->changeImmediateDominator(OrigHeader, OrigLatch); - - // Brute force incremental dominator tree update. Call - // findNearestCommonDominator on all CFG predecessors of each child of the - // original header. - DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader); - SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(), - OrigHeaderNode->end()); - bool Changed; - do { - Changed = false; - for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) { - DomTreeNode *Node = HeaderChildren[I]; - BasicBlock *BB = Node->getBlock(); - - BasicBlock *NearestDom = nullptr; - for (BasicBlock *Pred : predecessors(BB)) { - // Consider only reachable basic blocks. - if (!DT->getNode(Pred)) - continue; - - if (!NearestDom) { - NearestDom = Pred; - continue; - } - - NearestDom = DT->findNearestCommonDominator(NearestDom, Pred); - assert(NearestDom && "No NearestCommonDominator found"); - } - - assert(NearestDom && "Nearest dominator not found"); - - // Remember if this changes the DomTree. - if (Node->getIDom()->getBlock() != NearestDom) { - DT->changeImmediateDominator(BB, NearestDom); - Changed = true; - } - } - - // If the dominator changed, this may have an effect on other - // predecessors, continue until we reach a fixpoint. 
- } while (Changed); - } + if (DT) DT->deleteEdge(OrigPreheader, Exit); } assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); @@ -671,7 +634,7 @@ bool LoopRotate::processLoop(Loop *L) { if ((MadeChange || SimplifiedLatch) && LoopMD) L->setLoopID(LoopMD); - return MadeChange; + return MadeChange || SimplifiedLatch; } LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication) diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 3638da118cb7..953854c8b7b7 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -65,7 +65,9 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/IVUsers.h" +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -80,13 +82,18 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/OperandTraits.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" @@ -98,7 +105,6 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> @@ -107,8 +113,8 @@ #include <cstdint> #include <cstdlib> #include <iterator> +#include <limits> #include <map> -#include <tuple> #include <utility> using namespace llvm; @@ -131,7 +137,7 @@ static cl::opt<bool> EnablePhiElim( // The flag adds instruction count to solutions cost comparision. static cl::opt<bool> InsnsCost( - "lsr-insns-cost", cl::Hidden, cl::init(false), + "lsr-insns-cost", cl::Hidden, cl::init(true), cl::desc("Add instruction count to a LSR cost model")); // Flag to choose how to narrow complex lsr solution @@ -160,15 +166,14 @@ namespace { struct MemAccessTy { /// Used in situations where the accessed memory type is unknown. - static const unsigned UnknownAddressSpace = ~0u; + static const unsigned UnknownAddressSpace = + std::numeric_limits<unsigned>::max(); - Type *MemTy; - unsigned AddrSpace; + Type *MemTy = nullptr; + unsigned AddrSpace = UnknownAddressSpace; - MemAccessTy() : MemTy(nullptr), AddrSpace(UnknownAddressSpace) {} - - MemAccessTy(Type *Ty, unsigned AS) : - MemTy(Ty), AddrSpace(AS) {} + MemAccessTy() = default; + MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {} bool operator==(MemAccessTy Other) const { return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace; @@ -195,11 +200,11 @@ public: } // end anonymous namespace +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void RegSortData::print(raw_ostream &OS) const { OS << "[NumUses=" << UsedByIndices.count() << ']'; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void RegSortData::dump() const { print(errs()); errs() << '\n'; } @@ -209,7 +214,7 @@ namespace { /// Map register candidates to information about how they are used. 
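A note on the LoopRotation hunks above: the brute-force fixpoint walk over the old header's dominator children is replaced by the incremental DominatorTree update API, and the whole pattern reduces to a handful of lines. A condensed sketch (assuming an in-tree build; the APIs are exactly those used in the patch, while `rewireForRotation` is an illustrative wrapper, not a function in the tree):

```cpp
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
using namespace llvm;

// After rewiring OrigPreheader -> {NewHeader, Exit} and removing its old
// edge to OrigHeader, tell the DominatorTree what changed instead of
// recomputing immediate dominators by hand.
static void rewireForRotation(DominatorTree &DT, BasicBlock *OrigPreheader,
                              BasicBlock *OrigHeader, BasicBlock *NewHeader,
                              BasicBlock *Exit) {
  SmallVector<DominatorTree::UpdateType, 3> Updates;
  Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit});
  Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader});
  Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader});
  DT.applyUpdates(Updates); // batched, order-insensitive incremental update
}
```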
class RegUseTracker { - typedef DenseMap<const SCEV *, RegSortData> RegUsesTy; + using RegUsesTy = DenseMap<const SCEV *, RegSortData>; RegUsesTy RegUsesMap; SmallVector<const SCEV *, 16> RegSequence; @@ -225,8 +230,9 @@ public: void clear(); - typedef SmallVectorImpl<const SCEV *>::iterator iterator; - typedef SmallVectorImpl<const SCEV *>::const_iterator const_iterator; + using iterator = SmallVectorImpl<const SCEV *>::iterator; + using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator; + iterator begin() { return RegSequence.begin(); } iterator end() { return RegSequence.end(); } const_iterator begin() const { return RegSequence.begin(); } @@ -299,16 +305,16 @@ namespace { /// satisfying a use. It may include broken-out immediates and scaled registers. struct Formula { /// Global base address used for complex addressing. - GlobalValue *BaseGV; + GlobalValue *BaseGV = nullptr; /// Base offset for complex addressing. - int64_t BaseOffset; + int64_t BaseOffset = 0; /// Whether any complex addressing has a base register. - bool HasBaseReg; + bool HasBaseReg = false; /// The scale of any complex addressing. - int64_t Scale; + int64_t Scale = 0; /// The list of "base" registers for this use. When this is non-empty. The /// canonical representation of a formula is @@ -328,16 +334,14 @@ struct Formula { /// The 'scaled' register for this use. This should be non-null when Scale is /// not zero. - const SCEV *ScaledReg; + const SCEV *ScaledReg = nullptr; /// An additional constant offset which added near the use. This requires a /// temporary register, but the offset itself can live in an add immediate /// field rather than a register. - int64_t UnfoldedOffset; + int64_t UnfoldedOffset = 0; - Formula() - : BaseGV(nullptr), BaseOffset(0), HasBaseReg(false), Scale(0), - ScaledReg(nullptr), UnfoldedOffset(0) {} + Formula() = default; void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); @@ -562,6 +566,7 @@ bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx, return false; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void Formula::print(raw_ostream &OS) const { bool First = true; if (BaseGV) { @@ -598,7 +603,6 @@ void Formula::print(raw_ostream &OS) const { } } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void Formula::dump() const { print(errs()); errs() << '\n'; } @@ -773,7 +777,8 @@ static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { /// Returns true if the specified instruction is using the specified value as an /// address. -static bool isAddressUse(Instruction *Inst, Value *OperandVal) { +static bool isAddressUse(const TargetTransformInfo &TTI, + Instruction *Inst, Value *OperandVal) { bool isAddress = isa<LoadInst>(Inst); if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { if (SI->getPointerOperand() == OperandVal) @@ -782,11 +787,24 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) { // Addressing modes can also be folded into prefetches and a variety // of intrinsics. 
switch (II->getIntrinsicID()) { - default: break; - case Intrinsic::prefetch: - if (II->getArgOperand(0) == OperandVal) + case Intrinsic::memset: + case Intrinsic::prefetch: + if (II->getArgOperand(0) == OperandVal) + isAddress = true; + break; + case Intrinsic::memmove: + case Intrinsic::memcpy: + if (II->getArgOperand(0) == OperandVal || + II->getArgOperand(1) == OperandVal) + isAddress = true; + break; + default: { + MemIntrinsicInfo IntrInfo; + if (TTI.getTgtMemIntrinsic(II, IntrInfo)) { + if (IntrInfo.PtrVal == OperandVal) isAddress = true; - break; + } + } } } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) { if (RMW->getPointerOperand() == OperandVal) @@ -799,7 +817,8 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) { } /// Return the type of the memory being accessed. -static MemAccessTy getAccessType(const Instruction *Inst) { +static MemAccessTy getAccessType(const TargetTransformInfo &TTI, + Instruction *Inst) { MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace); if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { AccessTy.MemTy = SI->getOperand(0)->getType(); @@ -810,6 +829,21 @@ static MemAccessTy getAccessType(const Instruction *Inst) { AccessTy.AddrSpace = RMW->getPointerAddressSpace(); } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) { AccessTy.AddrSpace = CmpX->getPointerAddressSpace(); + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { + switch (II->getIntrinsicID()) { + case Intrinsic::prefetch: + AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace(); + break; + default: { + MemIntrinsicInfo IntrInfo; + if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) { + AccessTy.AddrSpace + = IntrInfo.PtrVal->getType()->getPointerAddressSpace(); + } + + break; + } + } } // All pointers have the same requirements, so canonicalize them to an @@ -948,6 +982,7 @@ class LSRUse; /// accurate cost model. static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F); + // Get the cost of the scaling factor used in F for LU. static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F, @@ -1013,30 +1048,30 @@ private: ScalarEvolution &SE, DominatorTree &DT, SmallPtrSetImpl<const SCEV *> *LoserRegs); }; - + /// An operand value in an instruction which is to be replaced with some /// equivalent, possibly strength-reduced, replacement. struct LSRFixup { /// The instruction which will be updated. - Instruction *UserInst; + Instruction *UserInst = nullptr; /// The operand of the instruction which will be replaced. The operand may be /// used more than once; every instance will be replaced. - Value *OperandValToReplace; + Value *OperandValToReplace = nullptr; /// If this user is to use the post-incremented value of an induction - /// variable, this variable is non-null and holds the loop associated with the + /// variable, this set is non-empty and holds the loops associated with the /// induction variable. PostIncLoopSet PostIncLoops; /// A constant offset to be added to the LSRUse expression. This allows /// multiple fixups to share the same LSRUse with different offsets, for /// example in an unrolled loop. 
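Condensing the isAddressUse change above: the generic memory intrinsics are special-cased, and everything else now defers to the target through getTgtMemIntrinsic, so target intrinsics with a pointer operand become addressing-mode candidates too. A sketch (APIs as used in the patch; the free function is an illustrative repackaging, not code from the tree):

```cpp
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

// Does intrinsic II use V as a memory address? Generic intrinsics are
// special-cased; anything else is answered by the target via
// getTgtMemIntrinsic, exactly as the patch does in isAddressUse.
static bool intrinsicUsesAsAddress(const TargetTransformInfo &TTI,
                                   IntrinsicInst *II, Value *V) {
  switch (II->getIntrinsicID()) {
  case Intrinsic::memset:
  case Intrinsic::prefetch:
    return II->getArgOperand(0) == V;
  case Intrinsic::memmove:
  case Intrinsic::memcpy:
    return II->getArgOperand(0) == V || II->getArgOperand(1) == V;
  default: {
    MemIntrinsicInfo IntrInfo;
    return TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal == V;
  }
  }
}
```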
- int64_t Offset; + int64_t Offset = 0; - bool isUseFullyOutsideLoop(const Loop *L) const; + LSRFixup() = default; - LSRFixup(); + bool isUseFullyOutsideLoop(const Loop *L) const; void print(raw_ostream &OS) const; void dump() const; @@ -1086,7 +1121,7 @@ public: // TODO: Add a generic icmp too? }; - typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair; + using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>; KindType Kind; MemAccessTy AccessTy; @@ -1095,25 +1130,25 @@ public: SmallVector<LSRFixup, 8> Fixups; /// Keep track of the min and max offsets of the fixups. - int64_t MinOffset; - int64_t MaxOffset; + int64_t MinOffset = std::numeric_limits<int64_t>::max(); + int64_t MaxOffset = std::numeric_limits<int64_t>::min(); /// This records whether all of the fixups using this LSRUse are outside of /// the loop, in which case some special-case heuristics may be used. - bool AllFixupsOutsideLoop; + bool AllFixupsOutsideLoop = true; /// RigidFormula is set to true to guarantee that this use will be associated /// with a single formula--the one that initially matched. Some SCEV /// expressions cannot be expanded. This allows LSR to consider the registers /// used by those expressions without the need to expand them later after /// changing the formula. - bool RigidFormula; + bool RigidFormula = false; /// This records the widest use type for any fixup using this /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max /// fixup widths to be equivalent, because the narrower one may be relying on /// the implicit truncation to truncate away bogus bits. - Type *WidestFixupType; + Type *WidestFixupType = nullptr; /// A list of ways to build a value that can satisfy this user. After the /// list is populated, one of these is selected heuristically and used to @@ -1123,10 +1158,7 @@ public: /// The set of register candidates used by all formulae in this LSRUse. SmallPtrSet<const SCEV *, 4> Regs; - LSRUse(KindType K, MemAccessTy AT) - : Kind(K), AccessTy(AT), MinOffset(INT64_MAX), MaxOffset(INT64_MIN), - AllFixupsOutsideLoop(true), RigidFormula(false), - WidestFixupType(nullptr) {} + LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {} LSRFixup &getNewFixup() { Fixups.push_back(LSRFixup()); @@ -1140,7 +1172,7 @@ public: if (f.Offset < MinOffset) MinOffset = f.Offset; } - + bool HasFormulaWithSameRegs(const Formula &F) const; float getNotSelectedProbability(const SCEV *Reg) const; bool InsertFormula(const Formula &F, const Loop &L); @@ -1153,6 +1185,12 @@ public: } // end anonymous namespace +static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, + LSRUse::KindType Kind, MemAccessTy AccessTy, + GlobalValue *BaseGV, int64_t BaseOffset, + bool HasBaseReg, int64_t Scale, + Instruction *Fixup = nullptr); + /// Tally up interesting quantities from the given register. void Cost::RateRegister(const SCEV *Reg, SmallPtrSetImpl<const SCEV *> &Regs, @@ -1280,8 +1318,9 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, // Check with target if this offset with this instruction is // specifically not supported. - if ((isa<LoadInst>(Fixup.UserInst) || isa<StoreInst>(Fixup.UserInst)) && - !TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset)) + if (LU.Kind == LSRUse::Address && Offset != 0 && + !isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV, + Offset, F.HasBaseReg, F.Scale, Fixup.UserInst)) C.NumBaseAdds++; } @@ -1325,14 +1364,14 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, /// Set this cost to a losing value. 
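The new in-class initializers for MinOffset/MaxOffset in the hunk above encode a small idiom: start the running bounds at the opposite extremes so the first fixup establishes both. The idiom in isolation (plain C++; the struct is illustrative, mirroring LSRUse::addFixup):

```cpp
#include <cstdint>
#include <cstdio>
#include <limits>

// Start the bounds at the opposite extremes so the first observed
// offset becomes both minimum and maximum.
struct OffsetRange {
  int64_t MinOffset = std::numeric_limits<int64_t>::max();
  int64_t MaxOffset = std::numeric_limits<int64_t>::min();

  void addOffset(int64_t O) {
    if (O > MaxOffset) MaxOffset = O;
    if (O < MinOffset) MinOffset = O;
  }
};

int main() {
  OffsetRange R;
  for (int64_t O : {16, -8, 4})
    R.addOffset(O);
  std::printf("[%lld, %lld]\n", (long long)R.MinOffset,
              (long long)R.MaxOffset); // [-8, 16]
}
```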
void Cost::Lose() { - C.Insns = ~0u; - C.NumRegs = ~0u; - C.AddRecCost = ~0u; - C.NumIVMuls = ~0u; - C.NumBaseAdds = ~0u; - C.ImmCost = ~0u; - C.SetupCost = ~0u; - C.ScaleCost = ~0u; + C.Insns = std::numeric_limits<unsigned>::max(); + C.NumRegs = std::numeric_limits<unsigned>::max(); + C.AddRecCost = std::numeric_limits<unsigned>::max(); + C.NumIVMuls = std::numeric_limits<unsigned>::max(); + C.NumBaseAdds = std::numeric_limits<unsigned>::max(); + C.ImmCost = std::numeric_limits<unsigned>::max(); + C.SetupCost = std::numeric_limits<unsigned>::max(); + C.ScaleCost = std::numeric_limits<unsigned>::max(); } /// Choose the lower cost. @@ -1343,6 +1382,7 @@ bool Cost::isLess(Cost &Other, const TargetTransformInfo &TTI) { return TTI.isLSRCostLess(C, Other.C); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void Cost::print(raw_ostream &OS) const { if (InsnsCost) OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s "); @@ -1363,16 +1403,11 @@ void Cost::print(raw_ostream &OS) const { OS << ", plus " << C.SetupCost << " setup cost"; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void Cost::dump() const { print(errs()); errs() << '\n'; } #endif -LSRFixup::LSRFixup() - : UserInst(nullptr), OperandValToReplace(nullptr), - Offset(0) {} - /// Test whether this fixup always uses its value outside of the given loop. bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const { // PHI nodes use their value in their incoming blocks. @@ -1387,6 +1422,7 @@ bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const { return !L->contains(UserInst); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LSRFixup::print(raw_ostream &OS) const { OS << "UserInst="; // Store is common and interesting enough to be worth special-casing. @@ -1410,7 +1446,6 @@ void LSRFixup::print(raw_ostream &OS) const { OS << ", Offset=" << Offset; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void LSRFixup::dump() const { print(errs()); errs() << '\n'; } @@ -1493,6 +1528,7 @@ void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) { RegUses.dropRegister(S, LUIdx); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LSRUse::print(raw_ostream &OS) const { OS << "LSR Use: Kind="; switch (Kind) { @@ -1526,7 +1562,6 @@ void LSRUse::print(raw_ostream &OS) const { OS << ", widest fixup type: " << *WidestFixupType; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void LSRUse::dump() const { print(errs()); errs() << '\n'; } @@ -1535,11 +1570,12 @@ LLVM_DUMP_METHOD void LSRUse::dump() const { static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, - bool HasBaseReg, int64_t Scale) { + bool HasBaseReg, int64_t Scale, + Instruction *Fixup/*= nullptr*/) { switch (Kind) { case LSRUse::Address: return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset, - HasBaseReg, Scale, AccessTy.AddrSpace); + HasBaseReg, Scale, AccessTy.AddrSpace, Fixup); case LSRUse::ICmpZero: // There's not even a target hook for querying whether it would be legal to @@ -1564,7 +1600,8 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset // Offs is the ICmp immediate. if (Scale == 0) - // The cast does the right thing with INT64_MIN. + // The cast does the right thing with + // std::numeric_limits<int64_t>::min(). 
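The reworded comment above is worth unpacking: negating std::numeric_limits<int64_t>::min() directly is signed-overflow undefined behavior, while the detour through uint64_t is well-defined modulo 2^64 and maps the value back to itself. A tiny standalone check (not part of the patch):

```cpp
#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
  int64_t BaseOffset = std::numeric_limits<int64_t>::min();
  // -BaseOffset would be UB; the cast makes the negation well-defined
  // modulo 2^64, and INT64_MIN maps back to itself.
  BaseOffset = -(uint64_t)BaseOffset;
  std::printf("%lld\n", (long long)BaseOffset); // -9223372036854775808
}
```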
BaseOffset = -(uint64_t)BaseOffset; return TTI.isLegalICmpImmediate(BaseOffset); } @@ -1645,6 +1682,16 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F) { + // Target may want to look at the user instructions. + if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) { + for (const LSRFixup &Fixup : LU.Fixups) + if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV, + (F.BaseOffset + Fixup.Offset), F.HasBaseReg, + F.Scale, Fixup.UserInst)) + return false; + return true; + } + return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale); @@ -1752,22 +1799,21 @@ struct IVInc { Value* IVOperand; const SCEV *IncExpr; - IVInc(Instruction *U, Value *O, const SCEV *E): - UserInst(U), IVOperand(O), IncExpr(E) {} + IVInc(Instruction *U, Value *O, const SCEV *E) + : UserInst(U), IVOperand(O), IncExpr(E) {} }; // The list of IV increments in program order. We typically add the head of a // chain without finding subsequent links. struct IVChain { - SmallVector<IVInc,1> Incs; - const SCEV *ExprBase; - - IVChain() : ExprBase(nullptr) {} + SmallVector<IVInc, 1> Incs; + const SCEV *ExprBase = nullptr; + IVChain() = default; IVChain(const IVInc &Head, const SCEV *Base) - : Incs(1, Head), ExprBase(Base) {} + : Incs(1, Head), ExprBase(Base) {} - typedef SmallVectorImpl<IVInc>::const_iterator const_iterator; + using const_iterator = SmallVectorImpl<IVInc>::const_iterator; // Return the first increment in the chain. const_iterator begin() const { @@ -1809,13 +1855,13 @@ class LSRInstance { LoopInfo &LI; const TargetTransformInfo &TTI; Loop *const L; - bool Changed; + bool Changed = false; /// This is the insert position that the current loop's induction variable /// increment should be placed. In simple loops, this is the latch block's /// terminator. But in more complicated cases, this is a position which will /// dominate all the in-loop post-increment users. - Instruction *IVIncInsertPos; + Instruction *IVIncInsertPos = nullptr; /// Interesting factors between use strides. /// @@ -1861,7 +1907,7 @@ class LSRInstance { void CollectFixupsAndInitialFormulae(); // Support for sharing of LSRUses between LSRFixups. - typedef DenseMap<LSRUse::SCEVUseKindPair, size_t> UseMapTy; + using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>; UseMapTy UseMap; bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, @@ -2002,6 +2048,14 @@ void LSRInstance::OptimizeShadowIV() { if (!PH) continue; if (PH->getNumIncomingValues() != 2) continue; + // If the calculation in integers overflows, the result in FP type will + // differ. So we only can do this transformation if we are guaranteed to not + // deal with overflowing values + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH)); + if (!AR) continue; + if (IsSigned && !AR->hasNoSignedWrap()) continue; + if (!IsSigned && !AR->hasNoUnsignedWrap()) continue; + Type *SrcTy = PH->getType(); int Mantissa = DestTy->getFPMantissaWidth(); if (Mantissa == -1) continue; @@ -2094,7 +2148,7 @@ bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) { /// unfortunately this can come up even for loops where the user didn't use /// a C do-while loop. 
For example, seemingly well-behaved top-test loops /// will commonly be lowered like this: -// +/// /// if (n > 0) { /// i = 0; /// do { @@ -2128,7 +2182,6 @@ bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) { /// This function solves this problem by detecting this type of loop and /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting /// the instructions for the maximum computation. -/// ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { // Check that the loop matches the pattern we're looking for. if (Cond->getPredicate() != CmpInst::ICMP_EQ && @@ -2268,7 +2321,6 @@ LSRInstance::OptimizeLoopTermCond() { // Otherwise treat this as a rotated loop. for (BasicBlock *ExitingBlock : ExitingBlocks) { - // Get the terminating condition for the loop if possible. If we // can, we want to change it to use a post-incremented version of its // induction variable, to allow coalescing the live ranges for the IV into @@ -2333,7 +2385,7 @@ LSRInstance::OptimizeLoopTermCond() { C->getValue().isMinSignedValue()) goto decline_post_inc; // Check for possible scaled-address reuse. - MemAccessTy AccessTy = getAccessType(UI->getUser()); + MemAccessTy AccessTy = getAccessType(TTI, UI->getUser()); int64_t Scale = C->getSExtValue(); if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr, /*BaseOffset=*/0, @@ -2941,7 +2993,7 @@ void LSRInstance::CollectChains() { // consider leaf IV Users. This effectively rediscovers a portion of // IVUsers analysis but in program order this time. if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I))) - continue; + continue; // Remove this instruction from any NearUsers set it may be in. for (unsigned ChainIdx = 0, NChains = IVChainVec.size(); @@ -3003,13 +3055,13 @@ void LSRInstance::FinalizeChain(IVChain &Chain) { static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, Value *Operand, const TargetTransformInfo &TTI) { const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr); - if (!IncConst || !isAddressUse(UserInst, Operand)) + if (!IncConst || !isAddressUse(TTI, UserInst, Operand)) return false; if (IncConst->getAPInt().getMinSignedBits() > 64) return false; - MemAccessTy AccessTy = getAccessType(UserInst); + MemAccessTy AccessTy = getAccessType(TTI, UserInst); int64_t IncOffset = IncConst->getValue()->getSExtValue(); if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr, IncOffset, /*HaseBaseReg=*/false)) @@ -3136,14 +3188,14 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { LSRUse::KindType Kind = LSRUse::Basic; MemAccessTy AccessTy; - if (isAddressUse(UserInst, U.getOperandValToReplace())) { + if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) { Kind = LSRUse::Address; - AccessTy = getAccessType(UserInst); + AccessTy = getAccessType(TTI, UserInst); } const SCEV *S = IU.getExpr(U); PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops(); - + // Equality (== and !=) ICmps are special. We can rewrite (i == N) as // (N - i == 0), and this allows (N - i) to be the expression that we work // with rather than just N or i, so we can consider the register @@ -3432,7 +3484,6 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(), JE = AddOps.end(); J != JE; ++J) { - // Loop-variant "unknown" values are uninteresting; we won't be able to // do anything meaningful with them. 
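As a reading aid for the OptimizeMax description above, here is the loop shape in question sketched as plain C++ (the helper body() is hypothetical, not from the patch):

void body(int);

// Guarded do-while as the midend sees it: the backedge test compares
// against bound = smax(n, 1), because the do-while body runs at least
// once.
void lowered(int n) {
  if (n > 0) {
    int bound = n > 1 ? n : 1; // smax(n, 1)
    int i = 0;
    do { body(i); ++i; } while (i != bound);
  }
}

// What OptimizeMax rewrites it into: the ICMP_NE test against the max
// becomes ICMP_SLT against n, and the max computation goes dead.
void rewritten(int n) {
  if (n > 0) {
    int i = 0;
    do { body(i); ++i; } while (i < n);
  }
}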
if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L)) @@ -3654,12 +3705,18 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, // Don't do this if there is more than one offset. if (LU.MinOffset != LU.MaxOffset) return; + // Check if transformation is valid. It is illegal to multiply pointer. + if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy()) + return; + for (const SCEV *BaseReg : Base.BaseRegs) + if (BaseReg->getType()->isPointerTy()) + return; assert(!Base.BaseGV && "ICmpZero use is not legal!"); // Check each interesting stride. for (int64_t Factor : Factors) { // Check that the multiplication doesn't overflow. - if (Base.BaseOffset == INT64_MIN && Factor == -1) + if (Base.BaseOffset == std::numeric_limits<int64_t>::min() && Factor == -1) continue; int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor; if (NewBaseOffset / Factor != Base.BaseOffset) @@ -3671,7 +3728,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, // Check that multiplying with the use offset doesn't overflow. int64_t Offset = LU.MinOffset; - if (Offset == INT64_MIN && Factor == -1) + if (Offset == std::numeric_limits<int64_t>::min() && Factor == -1) continue; Offset = (uint64_t)Offset * Factor; if (Offset / Factor != LU.MinOffset) @@ -3709,7 +3766,8 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, // Check that multiplying with the unfolded offset doesn't overflow. if (F.UnfoldedOffset != 0) { - if (F.UnfoldedOffset == INT64_MIN && Factor == -1) + if (F.UnfoldedOffset == std::numeric_limits<int64_t>::min() && + Factor == -1) continue; F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor; if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset) @@ -3833,7 +3891,7 @@ struct WorkItem { const SCEV *OrigReg; WorkItem(size_t LI, int64_t I, const SCEV *R) - : LUIdx(LI), Imm(I), OrigReg(R) {} + : LUIdx(LI), Imm(I), OrigReg(R) {} void print(raw_ostream &OS) const; void dump() const; @@ -3841,12 +3899,12 @@ struct WorkItem { } // end anonymous namespace +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void WorkItem::print(raw_ostream &OS) const { OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx << " , add offset " << Imm; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void WorkItem::dump() const { print(errs()); errs() << '\n'; } @@ -3856,7 +3914,8 @@ LLVM_DUMP_METHOD void WorkItem::dump() const { /// opportunities between them. void LSRInstance::GenerateCrossUseConstantOffsets() { // Group the registers by their value without any added constant offset. - typedef std::map<int64_t, const SCEV *> ImmMapTy; + using ImmMapTy = std::map<int64_t, const SCEV *>; + DenseMap<const SCEV *, ImmMapTy> Map; DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap; SmallVector<const SCEV *, 8> Sequence; @@ -4060,8 +4119,9 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { // Collect the best formula for each unique set of shared registers. This // is reset for each use. - typedef DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo> - BestFormulaeTy; + using BestFormulaeTy = + DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>; + BestFormulaeTy BestFormulae; for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { @@ -4148,7 +4208,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { } // This is a rough guess that seems to work fairly well. 
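The overflow screens in GenerateICmpZeroScales all follow one idiom: multiply in uint64_t, then check that division recovers the original operand. A compact sketch of that check, with the INT64_MIN * -1 special case handled first (the function name is ours):

#include <cstdint>
#include <limits>

// Multiply in uint64_t (well defined), then verify the product
// round-trips through division. INT64_MIN * -1 must be rejected up
// front, since that division would itself overflow. Returns true when
// Offset * Factor does not fit in int64_t.
bool mulOverflows(int64_t Offset, int64_t Factor, int64_t &Result) {
  if (Offset == std::numeric_limits<int64_t>::min() && Factor == -1)
    return true;
  Result = (uint64_t)Offset * (uint64_t)Factor;
  return Factor != 0 && Result / Factor != Offset;
}

GCC and Clang also offer __builtin_mul_overflow for the same test; the code here spells the check out portably instead.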
-static const size_t ComplexityLimit = UINT16_MAX; +static const size_t ComplexityLimit = std::numeric_limits<uint16_t>::max(); /// Estimate the worst-case number of solutions the solver might have to /// consider. It almost never considers this many solutions because it prune the @@ -4267,7 +4327,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { LUThatHas->pushFixup(Fixup); DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n'); } - + // Delete formulae from the new use which are no longer legal. bool Any = false; for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) { @@ -4332,7 +4392,8 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() { "from the Formulae with the same Scale and ScaledReg.\n"); // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse. - typedef DenseMap<std::pair<const SCEV *, int64_t>, size_t> BestFormulaeTy; + using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>; + BestFormulaeTy BestFormulae; #ifndef NDEBUG bool ChangedFormulae = false; @@ -4454,7 +4515,6 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() { /// Use3: /// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted /// reg(c) + reg({b,+,1}) 1 + 2/3 - void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() { if (EstimateSearchSpaceComplexity() < ComplexityLimit) return; @@ -4549,7 +4609,6 @@ void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() { print_uses(dbgs())); } - /// Pick a register which seems likely to be profitable, and then in any use /// which has any reference to that register, delete all formulae which do not /// reference that register. @@ -5196,8 +5255,7 @@ void LSRInstance::ImplementSolution( LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, const TargetTransformInfo &TTI) - : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L), Changed(false), - IVIncInsertPos(nullptr) { + : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L) { // If LoopSimplify form is not available, stay out of trouble. 
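FilterOutUndesirableDedicatedRegisters and the new NarrowSearchSpaceByFilterFormulaWithSameScaledReg both hinge on a map from a key to the best candidate seen so far; a generic sketch of that filtering pattern (stand-in types, not the LSR classes):

#include <cstddef>
#include <map>
#include <vector>

// Keep, for each key, only the best candidate; everything displaced is
// dropped. In LSR the key is e.g. the (ScaledReg, Scale) pair and the
// candidates are formulae compared by cost.
template <typename Key, typename Candidate, typename Better>
std::vector<Candidate> filterBest(const std::vector<Candidate> &Cands,
                                  Key (*keyOf)(const Candidate &),
                                  Better better) {
  std::map<Key, size_t> BestIdx;
  std::vector<Candidate> Kept;
  for (const Candidate &C : Cands) {
    auto [It, IsNew] = BestIdx.try_emplace(keyOf(C), Kept.size());
    if (IsNew)
      Kept.push_back(C);
    else if (better(C, Kept[It->second]))
      Kept[It->second] = C; // displace the previous best for this key
  }
  return Kept;
}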
if (!L->isLoopSimplifyForm()) return; @@ -5302,6 +5360,7 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, ImplementSolution(Solution); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LSRInstance::print_factors_and_types(raw_ostream &OS) const { if (Factors.empty() && Types.empty()) return; @@ -5352,7 +5411,6 @@ void LSRInstance::print(raw_ostream &OS) const { print_uses(OS); } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void LSRInstance::dump() const { print(errs()); errs() << '\n'; } @@ -5448,6 +5506,7 @@ PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM, } char LoopStrengthReduce::ID = 0; + INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce", "Loop Strength Reduction", false, false) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index 530a68424d5c..7b1d6446a24a 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -1,4 +1,4 @@ -//===-- LoopUnroll.cpp - Loop unroller pass -------------------------------===// +//===- LoopUnroll.cpp - Loop unroller pass --------------------------------===// // // The LLVM Compiler Infrastructure // @@ -13,29 +13,55 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopUnrollPass.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/LoopUnrollAnalyzer.h" -#include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/IR/DataLayout.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" -#include <climits> +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <limits> +#include <string> +#include <tuple> #include <utility> using namespace llvm; @@ -79,6 +105,10 @@ static cl::opt<unsigned> UnrollFullMaxCount( 
cl::desc( "Set the max unroll count for full unrolling, for testing purposes")); +static cl::opt<unsigned> UnrollPeelCount( + "unroll-peel-count", cl::Hidden, + cl::desc("Set the unroll peeling count, for testing purposes")); + static cl::opt<bool> UnrollAllowPartial("unroll-allow-partial", cl::Hidden, cl::desc("Allows loops to be partially unrolled until " @@ -114,6 +144,10 @@ static cl::opt<bool> cl::desc("Allows loops to be peeled when the dynamic " "trip count is known to be low.")); +static cl::opt<bool> UnrollUnrollRemainder( + "unroll-remainder", cl::Hidden, + cl::desc("Allow the loop remainder to be unrolled.")); + // This option isn't ever intended to be enabled, it serves to allow // experiments to check the assumptions about when this kind of revisit is // necessary. @@ -126,7 +160,7 @@ static cl::opt<bool> UnrollRevisitChildLoops( /// A magic value for use with the Threshold parameter to indicate /// that the loop unroll should be performed regardless of how much /// code expansion would result. -static const unsigned NoThreshold = UINT_MAX; +static const unsigned NoThreshold = std::numeric_limits<unsigned>::max(); /// Gather the various unrolling parameters based on the defaults, compiler /// flags, TTI overrides and user specified parameters. @@ -134,7 +168,7 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel, Optional<unsigned> UserThreshold, Optional<unsigned> UserCount, Optional<bool> UserAllowPartial, Optional<bool> UserRuntime, - Optional<bool> UserUpperBound) { + Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling) { TargetTransformInfo::UnrollingPreferences UP; // Set up the defaults @@ -146,12 +180,13 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( UP.Count = 0; UP.PeelCount = 0; UP.DefaultUnrollRuntimeCount = 8; - UP.MaxCount = UINT_MAX; - UP.FullUnrollMaxCount = UINT_MAX; + UP.MaxCount = std::numeric_limits<unsigned>::max(); + UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max(); UP.BEInsns = 2; UP.Partial = false; UP.Runtime = false; UP.AllowRemainder = true; + UP.UnrollRemainder = false; UP.AllowExpensiveTripCount = false; UP.Force = false; UP.UpperBound = false; @@ -177,6 +212,8 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( UP.MaxCount = UnrollMaxCount; if (UnrollFullMaxCount.getNumOccurrences() > 0) UP.FullUnrollMaxCount = UnrollFullMaxCount; + if (UnrollPeelCount.getNumOccurrences() > 0) + UP.PeelCount = UnrollPeelCount; if (UnrollAllowPartial.getNumOccurrences() > 0) UP.Partial = UnrollAllowPartial; if (UnrollAllowRemainder.getNumOccurrences() > 0) @@ -187,6 +224,8 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( UP.UpperBound = false; if (UnrollAllowPeeling.getNumOccurrences() > 0) UP.AllowPeeling = UnrollAllowPeeling; + if (UnrollUnrollRemainder.getNumOccurrences() > 0) + UP.UnrollRemainder = UnrollUnrollRemainder; // Apply user values provided by argument if (UserThreshold.hasValue()) { @@ -201,11 +240,14 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( UP.Runtime = *UserRuntime; if (UserUpperBound.hasValue()) UP.UpperBound = *UserUpperBound; + if (UserAllowPeeling.hasValue()) + UP.AllowPeeling = *UserAllowPeeling; return UP; } namespace { + /// A struct to densely store the state of an instruction after unrolling at /// each iteration. 
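gatherUnrollingPreferences layers its sources of truth: built-in defaults, then target hooks, then command-line flags that were actually given, then explicit per-caller overrides such as the new UserAllowPeeling. A condensed sketch of that precedence chain, using std::optional as a stand-in for llvm::Optional:

#include <optional>

struct Prefs { unsigned Threshold = 150; bool AllowPeeling = true; };

// Later layers win: defaults, then the target hook, then a flag that was
// actually passed on the command line, then the per-call-site request.
Prefs gather(std::optional<unsigned> FlagThreshold,
             std::optional<unsigned> UserThreshold,
             std::optional<bool> UserAllowPeeling) {
  Prefs P;                 // 1. baked-in defaults
  // 2. target hook would adjust P here (TTI.getUnrollingPreferences).
  if (FlagThreshold)       // 3. flag, only if it occurred
    P.Threshold = *FlagThreshold;
  if (UserThreshold)       // 4. explicit user request wins last
    P.Threshold = *UserThreshold;
  if (UserAllowPeeling)
    P.AllowPeeling = *UserAllowPeeling;
  return P;
}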
/// @@ -221,25 +263,27 @@ struct UnrolledInstState { /// Hashing and equality testing for a set of the instruction states. struct UnrolledInstStateKeyInfo { - typedef DenseMapInfo<Instruction *> PtrInfo; - typedef DenseMapInfo<std::pair<Instruction *, int>> PairInfo; + using PtrInfo = DenseMapInfo<Instruction *>; + using PairInfo = DenseMapInfo<std::pair<Instruction *, int>>; + static inline UnrolledInstState getEmptyKey() { return {PtrInfo::getEmptyKey(), 0, 0, 0}; } + static inline UnrolledInstState getTombstoneKey() { return {PtrInfo::getTombstoneKey(), 0, 0, 0}; } + static inline unsigned getHashValue(const UnrolledInstState &S) { return PairInfo::getHashValue({S.I, S.Iteration}); } + static inline bool isEqual(const UnrolledInstState &LHS, const UnrolledInstState &RHS) { return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration}); } }; -} -namespace { struct EstimatedUnrollCost { /// \brief The estimated cost after unrolling. unsigned UnrolledCost; @@ -248,7 +292,8 @@ struct EstimatedUnrollCost { /// rolled form. unsigned RolledDynamicCost; }; -} + +} // end anonymous namespace /// \brief Figure out if the loop is worth full unrolling. /// @@ -270,7 +315,8 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, // We want to be able to scale offsets by the trip count and add more offsets // to them without checking for overflows, and we already don't want to // analyze *massive* trip counts, so we force the max to be reasonably small. - assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) && + assert(UnrollMaxIterationsCountToAnalyze < + (unsigned)(std::numeric_limits<int>::max() / 2) && "The unroll iterations max is too large!"); // Only analyze inner loops. We can't properly estimate cost of nested loops @@ -633,43 +679,6 @@ static unsigned UnrollCountPragmaValue(const Loop *L) { return 0; } -// Remove existing unroll metadata and add unroll disable metadata to -// indicate the loop has already been unrolled. This prevents a loop -// from being unrolled more than is directed by a pragma if the loop -// unrolling pass is run more than once (which it generally is). -static void SetLoopAlreadyUnrolled(Loop *L) { - MDNode *LoopID = L->getLoopID(); - // First remove any existing loop unrolling metadata. - SmallVector<Metadata *, 4> MDs; - // Reserve first location for self reference to the LoopID metadata node. - MDs.push_back(nullptr); - - if (LoopID) { - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - bool IsUnrollMetadata = false; - MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); - if (MD) { - const MDString *S = dyn_cast<MDString>(MD->getOperand(0)); - IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll."); - } - if (!IsUnrollMetadata) - MDs.push_back(LoopID->getOperand(i)); - } - } - - // Add unroll(disable) metadata to disable future unrolling. - LLVMContext &Context = L->getHeader()->getContext(); - SmallVector<Metadata *, 1> DisableOperands; - DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable")); - MDNode *DisableNode = MDNode::get(Context, DisableOperands); - MDs.push_back(DisableNode); - - MDNode *NewLoopID = MDNode::get(Context, MDs); - // Set operand 0 to refer to the loop id itself. - NewLoopID->replaceOperandWith(0, NewLoopID); - L->setLoopID(NewLoopID); -} - // Computes the boosting factor for complete unrolling. // If fully unrolling the loop would save a lot of RolledDynamicCost, it would // be beneficial to fully unroll the loop even if unrolledcost is large. 
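UnrolledInstStateKeyInfo hashes and compares only the (instruction, iteration) pair and deliberately ignores the rest of the state; the same idea in standard C++, in case the DenseMapInfo plumbing obscures it (types here are stand-ins):

#include <cstddef>
#include <functional>
#include <unordered_map>

struct InstState {
  const void *I;   // stand-in for Instruction *
  int Iteration;
  unsigned Cost;   // payload; not part of the key
};

// Hash and equality look only at (I, Iteration), so two states for the
// same instruction and unrolled iteration collapse to one table entry.
struct InstStateKey {
  size_t operator()(const InstState &S) const {
    return std::hash<const void *>()(S.I) ^
           (std::hash<int>()(S.Iteration) * 0x9e3779b97f4a7c15ULL);
  }
  bool operator()(const InstState &A, const InstState &B) const {
    return A.I == B.I && A.Iteration == B.Iteration;
  }
};

using StateSet = std::unordered_map<InstState, bool, InstStateKey, InstStateKey>;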
We @@ -677,7 +686,7 @@ static void SetLoopAlreadyUnrolled(Loop *L) { // the unroll threshold. static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost, unsigned MaxPercentThresholdBoost) { - if (Cost.RolledDynamicCost >= UINT_MAX / 100) + if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100) return 100; else if (Cost.UnrolledCost != 0) // The boosting factor is RolledDynamicCost / UnrolledCost @@ -826,11 +835,14 @@ static bool computeUnrollCount( } if (UP.Count < 2) { if (PragmaEnableUnroll) - ORE->emit( - OptimizationRemarkMissed(DEBUG_TYPE, "UnrollAsDirectedTooLarge", - L->getStartLoc(), L->getHeader()) - << "Unable to unroll loop as directed by unroll(enable) pragma " - "because unrolled size is too large."); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, + "UnrollAsDirectedTooLarge", + L->getStartLoc(), L->getHeader()) + << "Unable to unroll loop as directed by unroll(enable) " + "pragma " + "because unrolled size is too large."; + }); UP.Count = 0; } } else { @@ -840,22 +852,27 @@ static bool computeUnrollCount( UP.Count = UP.MaxCount; if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount && UP.Count != TripCount) - ORE->emit( - OptimizationRemarkMissed(DEBUG_TYPE, "FullUnrollAsDirectedTooLarge", - L->getStartLoc(), L->getHeader()) - << "Unable to fully unroll loop as directed by unroll pragma because " - "unrolled size is too large."); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, + "FullUnrollAsDirectedTooLarge", + L->getStartLoc(), L->getHeader()) + << "Unable to fully unroll loop as directed by unroll pragma " + "because " + "unrolled size is too large."; + }); return ExplicitUnroll; } assert(TripCount == 0 && "All cases when TripCount is constant should be covered here."); if (PragmaFullUnroll) - ORE->emit( - OptimizationRemarkMissed(DEBUG_TYPE, - "CantFullUnrollAsDirectedRuntimeTripCount", - L->getStartLoc(), L->getHeader()) - << "Unable to fully unroll loop as directed by unroll(full) pragma " - "because loop has a runtime trip count."); + ORE->emit([&]() { + return OptimizationRemarkMissed( + DEBUG_TYPE, "CantFullUnrollAsDirectedRuntimeTripCount", + L->getStartLoc(), L->getHeader()) + << "Unable to fully unroll loop as directed by unroll(full) " + "pragma " + "because loop has a runtime trip count."; + }); // 6th priority is runtime unrolling. // Don't unroll a runtime trip count loop when it is disabled. @@ -904,19 +921,23 @@ static bool computeUnrollCount( "multiple, " << TripMultiple << ". Reducing unroll count from " << OrigCount << " to " << UP.Count << ".\n"); + using namespace ore; + if (PragmaCount > 0 && !UP.AllowRemainder) - ORE->emit( - OptimizationRemarkMissed(DEBUG_TYPE, - "DifferentUnrollCountFromDirected", - L->getStartLoc(), L->getHeader()) - << "Unable to unroll loop the number of times directed by " - "unroll_count pragma because remainder loop is restricted " - "(that could architecture specific or because the loop " - "contains a convergent instruction) and so must have an unroll " - "count that divides the loop trip multiple of " - << NV("TripMultiple", TripMultiple) << ". 
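Many remarks in this file are being rewrapped in ORE->emit([&]() { ... }); the point is laziness. A minimal model of that API shape (Emitter is a stand-in, not the real OptimizationRemarkEmitter):

#include <iostream>
#include <string>

// Accepting a callable means the (potentially expensive) remark text is
// only built when remarks are actually enabled and will be read.
struct Emitter {
  bool Enabled = false;
  template <typename MakeRemark> void emit(MakeRemark &&Make) {
    if (Enabled)
      std::cout << Make() << '\n'; // pay for string building only here
  }
};

int main() {
  Emitter ORE;
  int TripCount = 7;
  ORE.emit([&] {
    return "unable to unroll; trip count " + std::to_string(TripCount);
  });
  return 0;
}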
Unrolling instead " - << NV("UnrollCount", UP.Count) << " time(s)."); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, + "DifferentUnrollCountFromDirected", + L->getStartLoc(), L->getHeader()) + << "Unable to unroll loop the number of times directed by " + "unroll_count pragma because remainder loop is restricted " + "(that could architecture specific or because the loop " + "contains a convergent instruction) and so must have an " + "unroll " + "count that divides the loop trip multiple of " + << NV("TripMultiple", TripMultiple) << ". Unrolling instead " + << NV("UnrollCount", UP.Count) << " time(s)."; + }); } if (UP.Count > UP.MaxCount) @@ -927,23 +948,21 @@ static bool computeUnrollCount( return ExplicitUnroll; } -static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, - ScalarEvolution &SE, const TargetTransformInfo &TTI, - AssumptionCache &AC, OptimizationRemarkEmitter &ORE, - bool PreserveLCSSA, int OptLevel, - Optional<unsigned> ProvidedCount, - Optional<unsigned> ProvidedThreshold, - Optional<bool> ProvidedAllowPartial, - Optional<bool> ProvidedRuntime, - Optional<bool> ProvidedUpperBound) { +static LoopUnrollResult tryToUnrollLoop( + Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, + const TargetTransformInfo &TTI, AssumptionCache &AC, + OptimizationRemarkEmitter &ORE, bool PreserveLCSSA, int OptLevel, + Optional<unsigned> ProvidedCount, Optional<unsigned> ProvidedThreshold, + Optional<bool> ProvidedAllowPartial, Optional<bool> ProvidedRuntime, + Optional<bool> ProvidedUpperBound, Optional<bool> ProvidedAllowPeeling) { DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName() << "] Loop %" << L->getHeader()->getName() << "\n"); - if (HasUnrollDisablePragma(L)) - return false; - if (!L->isLoopSimplifyForm()) { + if (HasUnrollDisablePragma(L)) + return LoopUnrollResult::Unmodified; + if (!L->isLoopSimplifyForm()) { DEBUG( dbgs() << " Not unrolling loop which is not in loop-simplify form.\n"); - return false; + return LoopUnrollResult::Unmodified; } unsigned NumInlineCandidates; @@ -951,21 +970,22 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, bool Convergent; TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( L, SE, TTI, OptLevel, ProvidedThreshold, ProvidedCount, - ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound); + ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound, + ProvidedAllowPeeling); // Exit early if unrolling is disabled. if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0)) - return false; + return LoopUnrollResult::Unmodified; unsigned LoopSize = ApproximateLoopSize( L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC, UP.BEInsns); DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); if (NotDuplicatable) { DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable" << " instructions.\n"); - return false; + return LoopUnrollResult::Unmodified; } if (NumInlineCandidates != 0) { DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); - return false; + return LoopUnrollResult::Unmodified; } // Find trip count and trip multiple if count is not available @@ -1024,41 +1044,35 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, computeUnrollCount(L, TTI, DT, LI, SE, &ORE, TripCount, MaxTripCount, TripMultiple, LoopSize, UP, UseUpperBound); if (!UP.Count) - return false; + return LoopUnrollResult::Unmodified; // Unroll factor (Count) must be less or equal to TripCount. 
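tryToUnrollLoop now reports a LoopUnrollResult instead of a bool, so callers can tell partial from full unrolling apart; the essential shape (PartiallyUnrolled is the enum's third state, recalled from the LLVM headers rather than shown in this hunk):

// The bool return collapsed two interesting outcomes. The enum keeps
// them apart: the pass manager must special-case FullyUnrolled, whose
// Loop object is about to disappear.
enum class LoopUnrollResult { Unmodified, PartiallyUnrolled, FullyUnrolled };

// Callers that only care whether anything changed still get a cheap test.
inline bool changed(LoopUnrollResult R) {
  return R != LoopUnrollResult::Unmodified;
}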
if (TripCount && UP.Count > TripCount) UP.Count = TripCount; // Unroll the loop. - if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime, - UP.AllowExpensiveTripCount, UseUpperBound, MaxOrZero, - TripMultiple, UP.PeelCount, LI, &SE, &DT, &AC, &ORE, - PreserveLCSSA)) - return false; + LoopUnrollResult UnrollResult = UnrollLoop( + L, UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount, + UseUpperBound, MaxOrZero, TripMultiple, UP.PeelCount, UP.UnrollRemainder, + LI, &SE, &DT, &AC, &ORE, PreserveLCSSA); + if (UnrollResult == LoopUnrollResult::Unmodified) + return LoopUnrollResult::Unmodified; // If loop has an unroll count pragma or unrolled by explicitly set count // mark loop as unrolled to prevent unrolling beyond that requested. // If the loop was peeled, we already "used up" the profile information // we had, so we don't want to unroll or peel again. - if (IsCountSetExplicitly || UP.PeelCount) - SetLoopAlreadyUnrolled(L); + if (UnrollResult != LoopUnrollResult::FullyUnrolled && + (IsCountSetExplicitly || UP.PeelCount)) + L->setLoopAlreadyUnrolled(); - return true; + return UnrollResult; } namespace { + class LoopUnroll : public LoopPass { public: static char ID; // Pass ID, replacement for typeid - LoopUnroll(int OptLevel = 2, Optional<unsigned> Threshold = None, - Optional<unsigned> Count = None, - Optional<bool> AllowPartial = None, Optional<bool> Runtime = None, - Optional<bool> UpperBound = None) - : LoopPass(ID), OptLevel(OptLevel), ProvidedCount(std::move(Count)), - ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial), - ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound) { - initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); - } int OptLevel; Optional<unsigned> ProvidedCount; @@ -1066,8 +1080,21 @@ public: Optional<bool> ProvidedAllowPartial; Optional<bool> ProvidedRuntime; Optional<bool> ProvidedUpperBound; + Optional<bool> ProvidedAllowPeeling; - bool runOnLoop(Loop *L, LPPassManager &) override { + LoopUnroll(int OptLevel = 2, Optional<unsigned> Threshold = None, + Optional<unsigned> Count = None, + Optional<bool> AllowPartial = None, Optional<bool> Runtime = None, + Optional<bool> UpperBound = None, + Optional<bool> AllowPeeling = None) + : LoopPass(ID), OptLevel(OptLevel), ProvidedCount(std::move(Count)), + ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial), + ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound), + ProvidedAllowPeeling(AllowPeeling) { + initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override { if (skipLoop(L)) return false; @@ -1085,15 +1112,19 @@ public: OptimizationRemarkEmitter ORE(&F); bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); - return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel, - ProvidedCount, ProvidedThreshold, - ProvidedAllowPartial, ProvidedRuntime, - ProvidedUpperBound); + LoopUnrollResult Result = tryToUnrollLoop( + L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel, ProvidedCount, + ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime, + ProvidedUpperBound, ProvidedAllowPeeling); + + if (Result == LoopUnrollResult::FullyUnrolled) + LPM.markLoopAsDeleted(*L); + + return Result != LoopUnrollResult::Unmodified; } /// This transformation requires natural loop information & requires that /// loop preheaders be inserted into the CFG... 
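The new markLoopAsDeleted call below is a correctness requirement of the legacy pass manager, not just bookkeeping; a toy model of the contract (stand-in types, not the LLVM classes):

#include <iostream>

struct Loop { const char *Name; };
struct LPPassManager {
  void markLoopAsDeleted(Loop &L) {
    std::cout << "dropping deleted loop " << L.Name << " from the queue\n";
  }
};

// When unrolling removes the loop entirely, the pass must say so before
// returning, or the manager would keep a dangling Loop * on its worklist.
bool runOnLoopSketch(Loop *L, LPPassManager &LPM, bool FullyUnrolled) {
  if (FullyUnrolled)
    LPM.markLoopAsDeleted(*L);
  return true;
}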
- /// void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetTransformInfoWrapperPass>(); @@ -1102,9 +1133,11 @@ public: getLoopAnalysisUsage(AU); } }; -} + +} // end anonymous namespace char LoopUnroll::ID = 0; + INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(LoopPass) @@ -1112,8 +1145,8 @@ INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) Pass *llvm::createLoopUnrollPass(int OptLevel, int Threshold, int Count, - int AllowPartial, int Runtime, - int UpperBound) { + int AllowPartial, int Runtime, int UpperBound, + int AllowPeeling) { // TODO: It would make more sense for this function to take the optionals // directly, but that's dangerous since it would silently break out of tree // callers. @@ -1122,16 +1155,17 @@ Pass *llvm::createLoopUnrollPass(int OptLevel, int Threshold, int Count, Count == -1 ? None : Optional<unsigned>(Count), AllowPartial == -1 ? None : Optional<bool>(AllowPartial), Runtime == -1 ? None : Optional<bool>(Runtime), - UpperBound == -1 ? None : Optional<bool>(UpperBound)); + UpperBound == -1 ? None : Optional<bool>(UpperBound), + AllowPeeling == -1 ? None : Optional<bool>(AllowPeeling)); } Pass *llvm::createSimpleLoopUnrollPass(int OptLevel) { - return llvm::createLoopUnrollPass(OptLevel, -1, -1, 0, 0, 0); + return createLoopUnrollPass(OptLevel, -1, -1, 0, 0, 0, 0); } -PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM, - LoopStandardAnalysisResults &AR, - LPMUpdater &Updater) { +PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &Updater) { const auto &FAM = AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager(); Function *F = L.getHeader()->getParent(); @@ -1139,8 +1173,9 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM, auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F); // FIXME: This should probably be optional rather than required. if (!ORE) - report_fatal_error("LoopUnrollPass: OptimizationRemarkEmitterAnalysis not " - "cached at a higher level"); + report_fatal_error( + "LoopFullUnrollPass: OptimizationRemarkEmitterAnalysis not " + "cached at a higher level"); // Keep track of the previous loop structure so we can identify new loops // created by unrolling. @@ -1151,17 +1186,14 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM, else OldLoops.insert(AR.LI.begin(), AR.LI.end()); - // The API here is quite complex to call, but there are only two interesting - // states we support: partial and full (or "simple") unrolling. However, to - // enable these things we actually pass "None" in for the optional to avoid - // providing an explicit choice. 
- Optional<bool> AllowPartialParam, RuntimeParam, UpperBoundParam; - if (!AllowPartialUnrolling) - AllowPartialParam = RuntimeParam = UpperBoundParam = false; - bool Changed = tryToUnrollLoop( - &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE, - /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None, - /*Threshold*/ None, AllowPartialParam, RuntimeParam, UpperBoundParam); + std::string LoopName = L.getName(); + + bool Changed = + tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE, + /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None, + /*Threshold*/ None, /*AllowPartial*/ false, + /*Runtime*/ false, /*UpperBound*/ false, + /*AllowPeeling*/ false) != LoopUnrollResult::Unmodified; if (!Changed) return PreservedAnalyses::all(); @@ -1172,17 +1204,13 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM, #endif // Unrolling can do several things to introduce new loops into a loop nest: - // - Partial unrolling clones child loops within the current loop. If it - // uses a remainder, then it can also create any number of sibling loops. // - Full unrolling clones child loops within the current loop but then // removes the current loop making all of the children appear to be new // sibling loops. - // - Loop peeling can directly introduce new sibling loops by peeling one - // iteration. // - // When a new loop appears as a sibling loop, either from peeling an - // iteration or fully unrolling, its nesting structure has fundamentally - // changed and we want to revisit it to reflect that. + // When a new loop appears as a sibling loop after fully unrolling, + // its nesting structure has fundamentally changed and we want to revisit + // it to reflect that. // // When unrolling has removed the current loop, we need to tell the // infrastructure that it is gone. @@ -1209,13 +1237,11 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM, Updater.addSiblingLoops(SibLoops); if (!IsCurrentLoopValid) { - Updater.markLoopAsDeleted(L); + Updater.markLoopAsDeleted(L, LoopName); } else { // We can only walk child loops if the current loop remained valid. if (UnrollRevisitChildLoops) { - // Walk *all* of the child loops. This is a highly speculative mode - // anyways so look for any simplifications that arose from partial - // unrolling or peeling off of iterations. + // Walk *all* of the child loops. SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end()); Updater.addChildLoops(ChildLoops); } @@ -1223,3 +1249,105 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM, return getLoopPassPreservedAnalyses(); } + +template <typename RangeT> +static SmallVector<Loop *, 8> appendLoopsToWorklist(RangeT &&Loops) { + SmallVector<Loop *, 8> Worklist; + // We use an internal worklist to build up the preorder traversal without + // recursion. 
+ SmallVector<Loop *, 4> PreOrderLoops, PreOrderWorklist; + + for (Loop *RootL : Loops) { + assert(PreOrderLoops.empty() && "Must start with an empty preorder walk."); + assert(PreOrderWorklist.empty() && + "Must start with an empty preorder walk worklist."); + PreOrderWorklist.push_back(RootL); + do { + Loop *L = PreOrderWorklist.pop_back_val(); + PreOrderWorklist.append(L->begin(), L->end()); + PreOrderLoops.push_back(L); + } while (!PreOrderWorklist.empty()); + + Worklist.append(PreOrderLoops.begin(), PreOrderLoops.end()); + PreOrderLoops.clear(); + } + return Worklist; +} + +PreservedAnalyses LoopUnrollPass::run(Function &F, + FunctionAnalysisManager &AM) { + auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); + auto &LI = AM.getResult<LoopAnalysis>(F); + auto &TTI = AM.getResult<TargetIRAnalysis>(F); + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + auto &AC = AM.getResult<AssumptionAnalysis>(F); + auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); + + LoopAnalysisManager *LAM = nullptr; + if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F)) + LAM = &LAMProxy->getManager(); + + const ModuleAnalysisManager &MAM = + AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager(); + ProfileSummaryInfo *PSI = + MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); + + bool Changed = false; + + // The unroller requires loops to be in simplified form, and also needs LCSSA. + // Since simplification may add new inner loops, it has to run before the + // legality and profitability checks. This means running the loop unroller + // will simplify all loops, regardless of whether anything end up being + // unrolled. + for (auto &L : LI) { + Changed |= simplifyLoop(L, &DT, &LI, &SE, &AC, false /* PreserveLCSSA */); + Changed |= formLCSSARecursively(*L, DT, &LI, &SE); + } + + SmallVector<Loop *, 8> Worklist = appendLoopsToWorklist(LI); + + while (!Worklist.empty()) { + // Because the LoopInfo stores the loops in RPO, we walk the worklist + // from back to front so that we work forward across the CFG, which + // for unrolling is only needed to get optimization remarks emitted in + // a forward order. + Loop &L = *Worklist.pop_back_val(); +#ifndef NDEBUG + Loop *ParentL = L.getParentLoop(); +#endif + + // The API here is quite complex to call, but there are only two interesting + // states we support: partial and full (or "simple") unrolling. However, to + // enable these things we actually pass "None" in for the optional to avoid + // providing an explicit choice. + Optional<bool> AllowPartialParam, RuntimeParam, UpperBoundParam, + AllowPeeling; + // Check if the profile summary indicates that the profiled application + // has a huge working set size, in which case we disable peeling to avoid + // bloating it further. + if (PSI && PSI->hasHugeWorkingSetSize()) + AllowPeeling = false; + std::string LoopName = L.getName(); + LoopUnrollResult Result = + tryToUnrollLoop(&L, DT, &LI, SE, TTI, AC, ORE, + /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None, + /*Threshold*/ None, AllowPartialParam, RuntimeParam, + UpperBoundParam, AllowPeeling); + Changed |= Result != LoopUnrollResult::Unmodified; + + // The parent must not be damaged by unrolling! +#ifndef NDEBUG + if (Result != LoopUnrollResult::Unmodified && ParentL) + ParentL->verifyLoop(); +#endif + + // Clear any cached analysis results for L if we removed it completely. 
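The preorder walk above generalizes to any tree; a freestanding version of the same explicit-stack idiom (siblings come out in reverse declaration order, which is fine for a worklist):

#include <vector>

struct Node { std::vector<Node *> Children; };

// An explicit stack yields a preorder traversal without recursion, so
// arbitrarily deep nests cannot overflow the call stack.
std::vector<Node *> preorder(Node *Root) {
  std::vector<Node *> Order;
  std::vector<Node *> Stack{Root};
  while (!Stack.empty()) {
    Node *N = Stack.back();
    Stack.pop_back();
    Order.push_back(N);
    Stack.insert(Stack.end(), N->Children.begin(), N->Children.end());
  }
  return Order;
}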
+ if (LAM && Result == LoopUnrollResult::FullyUnrolled) + LAM->clear(L, LoopName); + } + + if (!Changed) + return PreservedAnalyses::all(); + + return getLoopPassPreservedAnalyses(); +} diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index d0c96fa627a4..bd468338a1d0 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -1,4 +1,4 @@ -//===-- LoopUnswitch.cpp - Hoist loop-invariant conditionals in loop ------===// +//===- LoopUnswitch.cpp - Hoist loop-invariant conditionals in loop -------===// // // The LLVM Compiler Infrastructure // @@ -26,30 +26,40 @@ // //===----------------------------------------------------------------------===// -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/BlockFrequencyInfo.h" -#include "llvm/Analysis/BlockFrequencyInfoImpl.h" -#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/DivergenceAnalysis.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" -#include "llvm/Support/BranchProbability.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -58,9 +68,15 @@ #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/ValueMapper.h" #include <algorithm> +#include <cassert> #include <map> #include <set> +#include <tuple> +#include <utility> +#include <vector> + using namespace llvm; #define DEBUG_TYPE "loop-unswitch" @@ -82,11 +98,9 @@ Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"), namespace { class LUAnalysisCache { - - typedef DenseMap<const SwitchInst*, SmallPtrSet<const Value *, 8> > - UnswitchedValsMap; - - typedef UnswitchedValsMap::iterator UnswitchedValsIt; + using UnswitchedValsMap = + DenseMap<const SwitchInst *, SmallPtrSet<const Value *, 8>>; + using UnswitchedValsIt = UnswitchedValsMap::iterator; struct LoopProperties { unsigned CanBeUnswitchedCount; @@ -97,12 +111,12 @@ namespace { // Here we use std::map instead of DenseMap, since we need to keep valid // LoopProperties pointer for current loop for better performance. 
- typedef std::map<const Loop*, LoopProperties> LoopPropsMap; - typedef LoopPropsMap::iterator LoopPropsMapIt; + using LoopPropsMap = std::map<const Loop *, LoopProperties>; + using LoopPropsMapIt = LoopPropsMap::iterator; LoopPropsMap LoopsProperties; - UnswitchedValsMap *CurLoopInstructions; - LoopProperties *CurrentLoopProperties; + UnswitchedValsMap *CurLoopInstructions = nullptr; + LoopProperties *CurrentLoopProperties = nullptr; // A loop unswitching with an estimated cost above this threshold // is not performed. MaxSize is turned into unswitching quota for @@ -121,9 +135,7 @@ namespace { unsigned MaxSize; public: - LUAnalysisCache() - : CurLoopInstructions(nullptr), CurrentLoopProperties(nullptr), - MaxSize(Threshold) {} + LUAnalysisCache() : MaxSize(Threshold) {} // Analyze loop. Check its size, calculate is it possible to unswitch // it. Returns true if we can unswitch this loop. @@ -164,12 +176,12 @@ namespace { LUAnalysisCache BranchesInfo; bool OptimizeForSize; - bool redoLoop; + bool redoLoop = false; - Loop *currentLoop; - DominatorTree *DT; - BasicBlock *loopHeader; - BasicBlock *loopPreheader; + Loop *currentLoop = nullptr; + DominatorTree *DT = nullptr; + BasicBlock *loopHeader = nullptr; + BasicBlock *loopPreheader = nullptr; bool SanitizeMemory; LoopSafetyInfo SafetyInfo; @@ -185,16 +197,17 @@ namespace { public: static char ID; // Pass ID, replacement for typeid - explicit LoopUnswitch(bool Os = false, bool hasBranchDivergence = false) : - LoopPass(ID), OptimizeForSize(Os), redoLoop(false), - currentLoop(nullptr), DT(nullptr), loopHeader(nullptr), - loopPreheader(nullptr), hasBranchDivergence(hasBranchDivergence) { + + explicit LoopUnswitch(bool Os = false, bool hasBranchDivergence = false) + : LoopPass(ID), OptimizeForSize(Os), + hasBranchDivergence(hasBranchDivergence) { initializeLoopUnswitchPass(*PassRegistry::getPassRegistry()); - } + } bool runOnLoop(Loop *L, LPPassManager &LPM) override; bool processCurrentLoop(); bool isUnreachableDueToPreviousUnswitching(BasicBlock *); + /// This transformation requires natural loop information & requires that /// loop preheaders be inserted into the CFG. /// @@ -207,7 +220,6 @@ namespace { } private: - void releaseMemory() override { BranchesInfo.forgetLoop(currentLoop); } @@ -237,7 +249,7 @@ namespace { void EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, BasicBlock *TrueDest, BasicBlock *FalseDest, - Instruction *InsertPt, + BranchInst *OldBranch, TerminatorInst *TI); void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L); @@ -247,13 +259,13 @@ namespace { Value *SimplifyInstructionWithNotEqual(Instruction *Inst, Value *Invariant, Constant *Val); }; -} + +} // end anonymous namespace // Analyze loop. Check its size, calculate is it possible to unswitch // it. Returns true if we can unswitch this loop. bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI, AssumptionCache *AC) { - LoopPropsMapIt PropsIt; bool Inserted; std::tie(PropsIt, Inserted) = @@ -302,7 +314,6 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI, // Clean all data related to given loop. void LUAnalysisCache::forgetLoop(const Loop *L) { - LoopPropsMapIt LIt = LoopsProperties.find(L); if (LIt != LoopsProperties.end()) { @@ -337,7 +348,6 @@ bool LUAnalysisCache::CostAllowsUnswitching() { // Note, that new loop data is stored inside the VMap. 
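The comment above about std::map versus DenseMap is about pointer stability; a short demonstration (toy key type) of why the cached LoopProperties pointer survives only with a node-based map:

#include <map>

// DenseMap stores key/value pairs inline in a flat table and may move
// them when it grows, so a held LoopProperties * can dangle after an
// insert. std::map allocates stable nodes: later inserts never move
// existing elements, which is what CurrentLoopProperties needs.
struct LoopProperties { unsigned CanBeUnswitchedCount = 0; };

int main() {
  std::map<int, LoopProperties> Props;
  LoopProperties *Cur = &Props[1]; // cache the current loop's entry
  for (int K = 2; K < 1000; ++K)
    Props[K];                      // node-based: Cur stays valid
  Cur->CanBeUnswitchedCount++;
  return 0;
}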
void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop, const ValueToValueMapTy &VMap) { - LoopProperties &NewLoopProps = LoopsProperties[NewLoop]; LoopProperties &OldLoopProps = *CurrentLoopProperties; UnswitchedValsMap &Insts = OldLoopProps.UnswitchedVals; @@ -367,6 +377,7 @@ void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop, } char LoopUnswitch::ID = 0; + INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) @@ -518,9 +529,6 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { Changed |= processCurrentLoop(); } while(redoLoop); - // FIXME: Reconstruct dom info, because it is not preserved properly. - if (Changed) - DT->recalculate(*F); return Changed; } @@ -553,6 +561,48 @@ bool LoopUnswitch::isUnreachableDueToPreviousUnswitching(BasicBlock *BB) { return false; } +/// FIXME: Remove this workaround when freeze related patches are done. +/// LoopUnswitch and Equality propagation in GVN have discrepancy about +/// whether branch on undef/poison has undefine behavior. Here it is to +/// rule out some common cases that we found such discrepancy already +/// causing problems. Detail could be found in PR31652. Note if the +/// func returns true, it is unsafe. But if it is false, it doesn't mean +/// it is necessarily safe. +static bool EqualityPropUnSafe(Value &LoopCond) { + ICmpInst *CI = dyn_cast<ICmpInst>(&LoopCond); + if (!CI || !CI->isEquality()) + return false; + + Value *LHS = CI->getOperand(0); + Value *RHS = CI->getOperand(1); + if (isa<UndefValue>(LHS) || isa<UndefValue>(RHS)) + return true; + + auto hasUndefInPHI = [](PHINode &PN) { + for (Value *Opd : PN.incoming_values()) { + if (isa<UndefValue>(Opd)) + return true; + } + return false; + }; + PHINode *LPHI = dyn_cast<PHINode>(LHS); + PHINode *RPHI = dyn_cast<PHINode>(RHS); + if ((LPHI && hasUndefInPHI(*LPHI)) || (RPHI && hasUndefInPHI(*RPHI))) + return true; + + auto hasUndefInSelect = [](SelectInst &SI) { + if (isa<UndefValue>(SI.getTrueValue()) || + isa<UndefValue>(SI.getFalseValue())) + return true; + return false; + }; + SelectInst *LSI = dyn_cast<SelectInst>(LHS); + SelectInst *RSI = dyn_cast<SelectInst>(RHS); + if ((LSI && hasUndefInSelect(*LSI)) || (RSI && hasUndefInSelect(*RSI))) + return true; + return false; +} + /// Do actual work and unswitch loop if possible and profitable. bool LoopUnswitch::processCurrentLoop() { bool Changed = false; @@ -666,7 +716,7 @@ bool LoopUnswitch::processCurrentLoop() { // unswitch on it if we desire. Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), currentLoop, Changed).first; - if (LoopCond && + if (LoopCond && !EqualityPropUnSafe(*LoopCond) && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) { ++NumBranches; return true; @@ -831,7 +881,7 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val, /// mapping the blocks with the specified map. static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, LoopInfo *LI, LPPassManager *LPM) { - Loop &New = *new Loop(); + Loop &New = *LI->AllocateLoop(); if (PL) PL->addChildLoop(&New); else @@ -852,31 +902,59 @@ static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, } /// Emit a conditional branch on two values if LIC == Val, branch to TrueDst, -/// otherwise branch to FalseDest. Insert the code immediately before InsertPt. +/// otherwise branch to FalseDest. Insert the code immediately before OldBranch +/// and remove (but not erase!) 
it from the function. void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, BasicBlock *TrueDest, BasicBlock *FalseDest, - Instruction *InsertPt, + BranchInst *OldBranch, TerminatorInst *TI) { + assert(OldBranch->isUnconditional() && "Preheader is not split correctly"); // Insert a conditional branch on LIC to the two preheaders. The original // code is the true version and the new code is the false version. Value *BranchVal = LIC; bool Swapped = false; if (!isa<ConstantInt>(Val) || Val->getType() != Type::getInt1Ty(LIC->getContext())) - BranchVal = new ICmpInst(InsertPt, ICmpInst::ICMP_EQ, LIC, Val); + BranchVal = new ICmpInst(OldBranch, ICmpInst::ICMP_EQ, LIC, Val); else if (Val != ConstantInt::getTrue(Val->getContext())) { // We want to enter the new loop when the condition is true. std::swap(TrueDest, FalseDest); Swapped = true; } + // Old branch will be removed, so save its parent and successor to update the + // DomTree. + auto *OldBranchSucc = OldBranch->getSuccessor(0); + auto *OldBranchParent = OldBranch->getParent(); + // Insert the new branch. BranchInst *BI = - IRBuilder<>(InsertPt).CreateCondBr(BranchVal, TrueDest, FalseDest, TI); + IRBuilder<>(OldBranch).CreateCondBr(BranchVal, TrueDest, FalseDest, TI); if (Swapped) BI->swapProfMetadata(); + // Remove the old branch so there is only one branch at the end. This is + // needed to perform DomTree's internal DFS walk on the function's CFG. + OldBranch->removeFromParent(); + + // Inform the DT about the new branch. + if (DT) { + // First, add both successors. + SmallVector<DominatorTree::UpdateType, 3> Updates; + if (TrueDest != OldBranchParent) + Updates.push_back({DominatorTree::Insert, OldBranchParent, TrueDest}); + if (FalseDest != OldBranchParent) + Updates.push_back({DominatorTree::Insert, OldBranchParent, FalseDest}); + // If both of the new successors are different from the old one, inform the + // DT that the edge was deleted. + if (OldBranchSucc != TrueDest && OldBranchSucc != FalseDest) { + Updates.push_back({DominatorTree::Delete, OldBranchParent, OldBranchSucc}); + } + + DT->applyUpdates(Updates); + } + // If either edge is critical, split it. This helps preserve LoopSimplify // form for enclosing loops. auto Options = CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA(); @@ -916,10 +994,14 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, // Okay, now we have a position to branch from and a position to branch to, // insert the new conditional branch. - EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, - loopPreheader->getTerminator(), TI); - LPM->deleteSimpleAnalysisValue(loopPreheader->getTerminator(), L); - loopPreheader->getTerminator()->eraseFromParent(); + auto *OldBranch = dyn_cast<BranchInst>(loopPreheader->getTerminator()); + assert(OldBranch && "Failed to split the preheader"); + EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, OldBranch, TI); + LPM->deleteSimpleAnalysisValue(OldBranch, L); + + // EmitPreheaderBranchOnCondition removed the OldBranch from the function. + // Delete it, as it is no longer needed. + delete OldBranch; // We need to reprocess this loop, it could be unswitched again. redoLoop = true; @@ -1035,6 +1117,9 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) { if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin())) return false; // Can't handle this. 
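The DominatorTree::applyUpdates sequence above is the incremental-update idiom this commit moves LoopUnswitch onto, replacing the old whole-function recalculation. Distilled into a helper (our name, same LLVM API as the hunk):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
using namespace llvm;

// Collect edge insertions and deletions, then hand them to the tree in
// one applyUpdates() call. Batching lets the tree apply them in a valid
// order, and it is far cheaper than recalculate()-ing the whole tree.
static void retargetBranchInDT(DominatorTree &DT, BasicBlock *From,
                               BasicBlock *OldSucc, BasicBlock *NewSucc) {
  SmallVector<DominatorTree::UpdateType, 2> Updates;
  Updates.push_back({DominatorTree::Insert, From, NewSucc});
  Updates.push_back({DominatorTree::Delete, From, OldSucc});
  DT.applyUpdates(Updates);
}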
+ if (EqualityPropUnSafe(*LoopCond)) + return false; + UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB, CurrentTerm); ++NumBranches; @@ -1231,7 +1316,10 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, EmitPreheaderBranchOnCondition(LIC, Val, NewBlocks[0], LoopBlocks[0], OldBR, TI); LPM->deleteSimpleAnalysisValue(OldBR, L); - OldBR->eraseFromParent(); + + // The OldBr was replaced by a new one and removed (but not erased) by + // EmitPreheaderBranchOnCondition. It is no longer needed, so delete it. + delete OldBR; LoopProcessWorklist.push_back(NewLoop); redoLoop = true; diff --git a/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/lib/Transforms/Scalar/LoopVersioningLICM.cpp index c23d891b6504..53b25e688e82 100644 --- a/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -1,4 +1,4 @@ -//===----------- LoopVersioningLICM.cpp - LICM Loop Versioning ------------===// +//===- LoopVersioningLICM.cpp - LICM Loop Versioning ----------------------===// // // The LLVM Compiler Infrastructure // @@ -60,41 +60,41 @@ // //===----------------------------------------------------------------------===// -#include "llvm/ADT/MapVector.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" -#include "llvm/IR/PatternMatch.h" -#include "llvm/IR/PredIteratorCache.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" -#include "llvm/Transforms/Utils/ValueMapper.h" +#include <cassert> +#include <memory> + +using namespace llvm; #define DEBUG_TYPE "loop-versioning-licm" -static const char *LICMVersioningMetaData = "llvm.loop.licm_versioning.disable"; -using namespace llvm; +static const char *LICMVersioningMetaData = "llvm.loop.licm_versioning.disable"; /// Threshold minimum allowed percentage for possible /// invariant instructions in a loop. 
@@ -143,9 +143,16 @@ void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *MDString, } namespace { + struct LoopVersioningLICM : public LoopPass { static char ID; + LoopVersioningLICM() + : LoopPass(ID), LoopDepthThreshold(LVLoopDepthThreshold), + InvariantThreshold(LVInvarThreshold) { + initializeLoopVersioningLICMPass(*PassRegistry::getPassRegistry()); + } + bool runOnLoop(Loop *L, LPPassManager &LPM) override; void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -161,13 +168,6 @@ struct LoopVersioningLICM : public LoopPass { AU.addPreserved<GlobalsAAWrapperPass>(); } - LoopVersioningLICM() - : LoopPass(ID), AA(nullptr), SE(nullptr), LAA(nullptr), LAI(nullptr), - CurLoop(nullptr), LoopDepthThreshold(LVLoopDepthThreshold), - InvariantThreshold(LVInvarThreshold), LoadAndStoreCounter(0), - InvariantCounter(0), IsReadOnlyLoop(true) { - initializeLoopVersioningLICMPass(*PassRegistry::getPassRegistry()); - } StringRef getPassName() const override { return "Loop Versioning for LICM"; } void reset() { @@ -191,30 +191,49 @@ struct LoopVersioningLICM : public LoopPass { }; private: - AliasAnalysis *AA; // Current AliasAnalysis information - ScalarEvolution *SE; // Current ScalarEvolution - LoopAccessLegacyAnalysis *LAA; // Current LoopAccessAnalysis - const LoopAccessInfo *LAI; // Current Loop's LoopAccessInfo + // Current AliasAnalysis information + AliasAnalysis *AA = nullptr; + + // Current ScalarEvolution + ScalarEvolution *SE = nullptr; + + // Current LoopAccessAnalysis + LoopAccessLegacyAnalysis *LAA = nullptr; + + // Current Loop's LoopAccessInfo + const LoopAccessInfo *LAI = nullptr; + + // The current loop we are working on. + Loop *CurLoop = nullptr; + + // AliasSet information for the current loop. + std::unique_ptr<AliasSetTracker> CurAST; - Loop *CurLoop; // The current loop we are working on. - std::unique_ptr<AliasSetTracker> - CurAST; // AliasSet information for the current loop. + // Maximum loop nest threshold + unsigned LoopDepthThreshold; - unsigned LoopDepthThreshold; // Maximum loop nest threshold - float InvariantThreshold; // Minimum invariant threshold - unsigned LoadAndStoreCounter; // Counter to track num of load & store - unsigned InvariantCounter; // Counter to track num of invariant - bool IsReadOnlyLoop; // Read only loop marker. + // Minimum invariant threshold + float InvariantThreshold; + + // Counter to track num of load & store + unsigned LoadAndStoreCounter = 0; + + // Counter to track num of invariant + unsigned InvariantCounter = 0; + + // Read only loop marker. + bool IsReadOnlyLoop = true; bool isLegalForVersioning(); bool legalLoopStructure(); bool legalLoopInstructions(); bool legalLoopMemoryAccesses(); bool isLoopAlreadyVisited(); - void setNoAliasToLoop(Loop *); - bool instructionSafeForVersioning(Instruction *); + void setNoAliasToLoop(Loop *VerLoop); + bool instructionSafeForVersioning(Instruction *I); }; -} + +} // end anonymous namespace /// \brief Check loop structure and confirms it's good for LoopVersioningLICM. bool LoopVersioningLICM::legalLoopStructure() { @@ -225,7 +244,7 @@ bool LoopVersioningLICM::legalLoopStructure() { return false; } // Loop should be innermost loop, if not return false. 
- if (CurLoop->getSubLoops().size()) { + if (!CurLoop->getSubLoops().empty()) { DEBUG(dbgs() << " loop is not innermost\n"); return false; } @@ -562,6 +581,7 @@ bool LoopVersioningLICM::runOnLoop(Loop *L, LPPassManager &LPM) { } char LoopVersioningLICM::ID = 0; + INITIALIZE_PASS_BEGIN(LoopVersioningLICM, "loop-versioning-licm", "Loop Versioning For LICM", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) diff --git a/lib/Transforms/Scalar/LowerAtomic.cpp b/lib/Transforms/Scalar/LowerAtomic.cpp index 6f77c5bd0d07..c165c5ece95c 100644 --- a/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/lib/Transforms/Scalar/LowerAtomic.cpp @@ -15,7 +15,6 @@ #include "llvm/Transforms/Scalar/LowerAtomic.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" using namespace llvm; diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 7896396f0898..9c870b42a747 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -14,10 +14,12 @@ #include "llvm/Transforms/Scalar/MemCpyOptimizer.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" @@ -25,6 +27,8 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" @@ -41,6 +45,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" @@ -54,6 +59,7 @@ #include <algorithm> #include <cassert> #include <cstdint> +#include <utility> using namespace llvm; @@ -225,15 +231,18 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { namespace { class MemsetRanges { + using range_iterator = SmallVectorImpl<MemsetRange>::iterator; + /// A sorted list of the memset ranges. SmallVector<MemsetRange, 8> Ranges; - typedef SmallVectorImpl<MemsetRange>::iterator range_iterator; + const DataLayout &DL; public: MemsetRanges(const DataLayout &DL) : DL(DL) {} - typedef SmallVectorImpl<MemsetRange>::const_iterator const_iterator; + using const_iterator = SmallVectorImpl<MemsetRange>::const_iterator; + const_iterator begin() const { return Ranges.begin(); } const_iterator end() const { return Ranges.end(); } bool empty() const { return Ranges.empty(); } @@ -259,7 +268,6 @@ public: void addRange(int64_t Start, int64_t Size, Value *Ptr, unsigned Alignment, Instruction *Inst); - }; } // end anonymous namespace @@ -356,10 +364,10 @@ private: } }; -char MemCpyOptLegacyPass::ID = 0; - } // end anonymous namespace +char MemCpyOptLegacyPass::ID = 0; + /// The public interface to this file... FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOptLegacyPass(); } @@ -450,7 +458,6 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, // emit memset's for anything big enough to be worthwhile. 
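For a concrete picture of what the MemsetRanges bookkeeping above collects, consider this illustrative source pattern (invented for exposition, not taken from the patch):

```cpp
// Illustration only: adjacent stores that MemsetRanges coalesces. Each store
// is fed to addRange() with its (Start, Size); contiguous ranges merge, and
// a profitable merged range is then lowered to a single memset.
void clearHeader(char *P) {
  P[0] = 0; // range [0, 1)
  P[1] = 0; // extends it to [0, 2)
  P[2] = 0; // extends it to [0, 3)
  P[3] = 0; // extends it to [0, 4)  ->  memset(P, 0, 4)
}
```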
Instruction *AMemSet = nullptr; for (const MemsetRange &Range : Ranges) { - if (Range.TheStores.size() == 1) continue; // If it is profitable to lower this range to memset, do so now. @@ -511,7 +518,7 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P, const LoadInst *LI) { // If the store alias this position, early bail out. MemoryLocation StoreLoc = MemoryLocation::get(SI); - if (AA.getModRefInfo(P, StoreLoc) != MRI_NoModRef) + if (isModOrRefSet(AA.getModRefInfo(P, StoreLoc))) return false; // Keep track of the arguments of all instruction we plan to lift @@ -535,20 +542,20 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P, for (auto I = --SI->getIterator(), E = P->getIterator(); I != E; --I) { auto *C = &*I; - bool MayAlias = AA.getModRefInfo(C) != MRI_NoModRef; + bool MayAlias = isModOrRefSet(AA.getModRefInfo(C, None)); bool NeedLift = false; if (Args.erase(C)) NeedLift = true; else if (MayAlias) { NeedLift = llvm::any_of(MemLocs, [C, &AA](const MemoryLocation &ML) { - return AA.getModRefInfo(C, ML); + return isModOrRefSet(AA.getModRefInfo(C, ML)); }); if (!NeedLift) NeedLift = llvm::any_of(CallSites, [C, &AA](const ImmutableCallSite &CS) { - return AA.getModRefInfo(C, CS); + return isModOrRefSet(AA.getModRefInfo(C, CS)); }); } @@ -558,18 +565,18 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P, if (MayAlias) { // Since LI is implicitly moved downwards past the lifted instructions, // none of them may modify its source. - if (AA.getModRefInfo(C, LoadLoc) & MRI_Mod) + if (isModSet(AA.getModRefInfo(C, LoadLoc))) return false; else if (auto CS = ImmutableCallSite(C)) { // If we can't lift this before P, it's game over. - if (AA.getModRefInfo(P, CS) != MRI_NoModRef) + if (isModOrRefSet(AA.getModRefInfo(P, CS))) return false; CallSites.push_back(CS); } else if (isa<LoadInst>(C) || isa<StoreInst>(C) || isa<VAArgInst>(C)) { // If we can't lift this before P, it's game over. auto ML = MemoryLocation::get(C); - if (AA.getModRefInfo(P, ML) != MRI_NoModRef) + if (isModOrRefSet(AA.getModRefInfo(P, ML))) return false; MemLocs.push_back(ML); @@ -624,7 +631,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { // of at the store position. Instruction *P = SI; for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) { - if (AA.getModRefInfo(&I, LoadLoc) & MRI_Mod) { + if (isModSet(AA.getModRefInfo(&I, LoadLoc))) { P = &I; break; } @@ -695,7 +702,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { MemoryLocation StoreLoc = MemoryLocation::get(SI); for (BasicBlock::iterator I = --SI->getIterator(), E = C->getIterator(); I != E; --I) { - if (AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) { + if (isModOrRefSet(AA.getModRefInfo(&*I, StoreLoc))) { C = nullptr; break; } @@ -927,9 +934,9 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest, AliasAnalysis &AA = LookupAliasAnalysis(); ModRefInfo MR = AA.getModRefInfo(C, cpyDest, srcSize); // If necessary, perform additional analysis. 
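The recurring substitution in the MemCpyOpt hunks above and below is a move from comparing ModRefInfo values directly to querying them through predicate helpers. A minimal sketch of the idiom, assuming an `AliasAnalysis &AA`, an `Instruction *I`, and a `MemoryLocation Loc` are in scope:

```cpp
// Before: ModRefInfo treated as a raw enum / bitmask.
//   if (AA.getModRefInfo(I, Loc) != MRI_NoModRef) ... // any access to Loc
//   if (AA.getModRefInfo(I, Loc) & MRI_Mod) ...       // any write to Loc
// After: the same queries go through predicate helpers.
const ModRefInfo MR = AA.getModRefInfo(I, Loc);
if (isModOrRefSet(MR)) { /* I may read or write Loc */ }
if (isModSet(MR))      { /* I may write Loc */ }
```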
-  if (MR != MRI_NoModRef)
+  if (isModOrRefSet(MR))
     MR = AA.callCapturesBefore(C, cpyDest, srcSize, &DT);
-  if (MR != MRI_NoModRef)
+  if (isModOrRefSet(MR))
     return false;
 
   // We can't create address space casts here because we don't know if they're
diff --git a/lib/Transforms/Scalar/MergeICmps.cpp b/lib/Transforms/Scalar/MergeICmps.cpp
new file mode 100644
index 000000000000..9869a3fb96fa
--- /dev/null
+++ b/lib/Transforms/Scalar/MergeICmps.cpp
@@ -0,0 +1,650 @@
+//===- MergeICmps.cpp - Optimize chains of integer comparisons ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass turns chains of integer comparisons into memcmp (the memcmp is
+// later typically inlined as a chain of efficient hardware comparisons). This
+// typically benefits C++ member or non-member operator==().
+//
+// The basic idea is to replace a larger chain of integer comparisons loaded
+// from contiguous memory locations with a smaller chain of such integer
+// comparisons. The benefits are twofold:
+//  - There are fewer jumps, and therefore fewer opportunities for
+//    mispredictions and I-cache misses.
+//  - Code size is smaller, both because jumps are removed and because the
+//    encoding of a 2*n byte compare is smaller than that of two n-byte
+//    compares.
+
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <numeric>
+#include <utility>
+#include <vector>
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+
+using namespace llvm;
+
+namespace {
+
+#define DEBUG_TYPE "mergeicmps"
+
+// A BCE atom.
+struct BCEAtom {
+  BCEAtom() : GEP(nullptr), LoadI(nullptr), Offset() {}
+
+  const Value *Base() const { return GEP ? GEP->getPointerOperand() : nullptr; }
+
+  bool operator<(const BCEAtom &O) const {
+    assert(Base() && "invalid atom");
+    assert(O.Base() && "invalid atom");
+    // Just ordering by (Base(), Offset) is sufficient. However, this would
+    // make the ordering depend on the addresses of the base values, which
+    // are not reproducible from run to run. To guarantee stability, we use
+    // the names of the values if they exist; we sort by:
+    // (Base.getName(), Base(), Offset).
+    const int NameCmp = Base()->getName().compare(O.Base()->getName());
+    if (NameCmp == 0) {
+      if (Base() == O.Base()) {
+        return Offset.slt(O.Offset);
+      }
+      return Base() < O.Base();
+    }
+    return NameCmp < 0;
+  }
+
+  GetElementPtrInst *GEP;
+  LoadInst *LoadI;
+  APInt Offset;
+};
+
+// If this value is a load from a constant offset w.r.t. a base address, and
+// there are no other users of the load or address, returns the base address
+// and the offset.
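To make the new pass concrete, here is an illustrative source-level example (the struct and operator are invented; they do not come from the pass or its tests). Two 4-byte compares over adjacent offsets against the same two bases satisfy the `IsContiguous()` condition defined further down (offset 0 + 32/8 == offset 4), so the chain can be fused:

```cpp
// Illustration only: a comparison chain that MergeICmps can turn into memcmp.
struct Point {
  int x; // offset 0, 4 bytes
  int y; // offset 4, 4 bytes
};

bool operator==(const Point &A, const Point &B) {
  // Compiled naively this is a chain of two icmp-eq blocks; after the pass
  // it behaves like: memcmp(&A, &B, 8) == 0.
  return A.x == B.x && A.y == B.y;
}
```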
+BCEAtom visitICmpLoadOperand(Value *const Val) {
+  BCEAtom Result;
+  if (auto *const LoadI = dyn_cast<LoadInst>(Val)) {
+    DEBUG(dbgs() << "load\n");
+    if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
+      DEBUG(dbgs() << "used outside of block\n");
+      return {};
+    }
+    if (LoadI->isVolatile()) {
+      DEBUG(dbgs() << "volatile\n");
+      return {};
+    }
+    Value *const Addr = LoadI->getOperand(0);
+    if (auto *const GEP = dyn_cast<GetElementPtrInst>(Addr)) {
+      DEBUG(dbgs() << "GEP\n");
+      if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
+        DEBUG(dbgs() << "used outside of block\n");
+        return {};
+      }
+      const auto &DL = GEP->getModule()->getDataLayout();
+      if (!isDereferenceablePointer(GEP, DL)) {
+        DEBUG(dbgs() << "not dereferenceable\n");
+        // We need to make sure that we can do the comparisons in any order,
+        // so we require memory to be unconditionally dereferenceable.
+        return {};
+      }
+      Result.Offset = APInt(DL.getPointerTypeSizeInBits(GEP->getType()), 0);
+      if (GEP->accumulateConstantOffset(DL, Result.Offset)) {
+        Result.GEP = GEP;
+        Result.LoadI = LoadI;
+      }
+    }
+  }
+  return Result;
+}
+
+// A basic block with a comparison between two BCE atoms.
+// Note: the terminology is misleading: the comparison is symmetric, so there
+// is no real {l/r}hs. What we want, though, is to have the same base on the
+// left (resp. right), so that we can detect consecutive loads. To ensure this
+// we put the smallest atom on the left.
+class BCECmpBlock {
+ public:
+  BCECmpBlock() {}
+
+  BCECmpBlock(BCEAtom L, BCEAtom R, int SizeBits)
+      : Lhs_(L), Rhs_(R), SizeBits_(SizeBits) {
+    if (Rhs_ < Lhs_) std::swap(Rhs_, Lhs_);
+  }
+
+  bool IsValid() const {
+    return Lhs_.Base() != nullptr && Rhs_.Base() != nullptr;
+  }
+
+  // Assert that the block is consistent: if valid, it should also have
+  // non-null members besides Lhs_ and Rhs_.
+  void AssertConsistent() const {
+    if (IsValid()) {
+      assert(BB);
+      assert(CmpI);
+      assert(BranchI);
+    }
+  }
+
+  const BCEAtom &Lhs() const { return Lhs_; }
+  const BCEAtom &Rhs() const { return Rhs_; }
+  int SizeBits() const { return SizeBits_; }
+
+  // Returns true if the block does other work besides the comparison.
+  bool doesOtherWork() const;
+
+  // The basic block where this comparison happens.
+  BasicBlock *BB = nullptr;
+  // The ICMP for this comparison.
+  ICmpInst *CmpI = nullptr;
+  // The terminating branch.
+  BranchInst *BranchI = nullptr;
+
+ private:
+  BCEAtom Lhs_;
+  BCEAtom Rhs_;
+  int SizeBits_ = 0;
+};
+
+bool BCECmpBlock::doesOtherWork() const {
+  AssertConsistent();
+  // TODO(courbet): Can we allow some other things? This is very conservative.
+  // We might be able to get away with anything that does not have any side
+  // effects outside of the basic block.
+  // Note: The GEPs and/or loads are not necessarily in the same block.
+  for (const Instruction &Inst : *BB) {
+    if (const auto *const GEP = dyn_cast<GetElementPtrInst>(&Inst)) {
+      if (!(Lhs_.GEP == GEP || Rhs_.GEP == GEP)) return true;
+    } else if (const auto *const L = dyn_cast<LoadInst>(&Inst)) {
+      if (!(Lhs_.LoadI == L || Rhs_.LoadI == L)) return true;
+    } else if (const auto *const C = dyn_cast<ICmpInst>(&Inst)) {
+      if (C != CmpI) return true;
+    } else if (const auto *const Br = dyn_cast<BranchInst>(&Inst)) {
+      if (Br != BranchI) return true;
+    } else {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Visit the given comparison. If this is a comparison between two valid
+// BCE atoms, returns the comparison.
+BCECmpBlock visitICmp(const ICmpInst *const CmpI,
+                      const ICmpInst::Predicate ExpectedPredicate) {
+  if (CmpI->getPredicate() == ExpectedPredicate) {
+    DEBUG(dbgs() << "cmp "
+                 << (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne")
+                 << "\n");
+    auto Lhs = visitICmpLoadOperand(CmpI->getOperand(0));
+    if (!Lhs.Base()) return {};
+    auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1));
+    if (!Rhs.Base()) return {};
+    return BCECmpBlock(std::move(Lhs), std::move(Rhs),
+                       CmpI->getOperand(0)->getType()->getScalarSizeInBits());
+  }
+  return {};
+}
+
+// Visit the given comparison block. If this is a comparison between two valid
+// BCE atoms, returns the comparison.
+BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
+                          const BasicBlock *const PhiBlock) {
+  if (Block->empty()) return {};
+  auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator());
+  if (!BranchI) return {};
+  DEBUG(dbgs() << "branch\n");
+  if (BranchI->isUnconditional()) {
+    // In this case, we expect an incoming value which is the result of the
+    // comparison. This is the last link in the chain of comparisons (note
+    // that this does not mean that this is the last incoming value, blocks
+    // can be reordered).
+    auto *const CmpI = dyn_cast<ICmpInst>(Val);
+    if (!CmpI) return {};
+    DEBUG(dbgs() << "icmp\n");
+    auto Result = visitICmp(CmpI, ICmpInst::ICMP_EQ);
+    Result.CmpI = CmpI;
+    Result.BranchI = BranchI;
+    return Result;
+  } else {
+    // In this case, we expect a constant incoming value (the comparison is
+    // chained).
+    const auto *const Const = dyn_cast<ConstantInt>(Val);
+    DEBUG(dbgs() << "const\n");
+    if (!Const->isZero()) return {};
+    DEBUG(dbgs() << "false\n");
+    auto *const CmpI = dyn_cast<ICmpInst>(BranchI->getCondition());
+    if (!CmpI) return {};
+    DEBUG(dbgs() << "icmp\n");
+    assert(BranchI->getNumSuccessors() == 2 && "expecting a cond branch");
+    BasicBlock *const FalseBlock = BranchI->getSuccessor(1);
+    auto Result = visitICmp(
+        CmpI, FalseBlock == PhiBlock ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE);
+    Result.CmpI = CmpI;
+    Result.BranchI = BranchI;
+    return Result;
+  }
+  return {};
+}
+
+// A chain of comparisons.
+class BCECmpChain {
+ public:
+  BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi);
+
+  int size() const { return Comparisons_.size(); }
+
+#ifdef MERGEICMPS_DOT_ON
+  void dump() const;
+#endif  // MERGEICMPS_DOT_ON
+
+  bool simplify(const TargetLibraryInfo *const TLI);
+
+ private:
+  static bool IsContiguous(const BCECmpBlock &First,
+                           const BCECmpBlock &Second) {
+    return First.Lhs().Base() == Second.Lhs().Base() &&
+           First.Rhs().Base() == Second.Rhs().Base() &&
+           First.Lhs().Offset + First.SizeBits() / 8 == Second.Lhs().Offset &&
+           First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset;
+  }
+
+  // Merges the given comparison blocks into one memcmp block and updates
+  // branches. Comparisons are assumed to be contiguous. If NextBBInChain is
+  // null, the merged block will link to the phi block.
+  static void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
+                               BasicBlock *const NextBBInChain, PHINode &Phi,
+                               const TargetLibraryInfo *const TLI);
+
+  PHINode &Phi_;
+  std::vector<BCECmpBlock> Comparisons_;
+  // The original entry block (before sorting).
+  BasicBlock *EntryBlock_;
+};
+
+BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi)
+    : Phi_(Phi) {
+  // Now look inside blocks to check for BCE comparisons.
+  std::vector<BCECmpBlock> Comparisons;
+  for (BasicBlock *Block : Blocks) {
+    BCECmpBlock Comparison = visitCmpBlock(Phi.getIncomingValueForBlock(Block),
+                                           Block, Phi.getParent());
+    Comparison.BB = Block;
+    if (!Comparison.IsValid()) {
+      DEBUG(dbgs() << "skip: not a valid BCECmpBlock\n");
+      return;
+    }
+    if (Comparison.doesOtherWork()) {
+      DEBUG(dbgs() << "block does extra work besides compare\n");
+      if (Comparisons.empty()) {  // First block.
+        // TODO(courbet): The first block can do other things, and we should
+        // split them apart in a separate block before the comparison chain.
+        // Right now we just discard it and make the chain shorter.
+        DEBUG(dbgs()
+              << "ignoring first block that does extra work besides compare\n");
+        continue;
+      }
+      // TODO(courbet): Right now we abort the whole chain. We could be
+      // merging only the blocks that don't do other work and resume the
+      // chain from there. For example:
+      //  if (a[0] == b[0]) {  // bb1
+      //    if (a[1] == b[1]) {  // bb2
+      //      some_value = 3; //bb3
+      //      if (a[2] == b[2]) { //bb3
+      //        do a ton of stuff  //bb4
+      //      }
+      //    }
+      //  }
+      //
+      // This is:
+      //
+      //  bb1 --eq--> bb2 --eq--> bb3* -eq--> bb4 --+
+      //   \            \           \          \
+      //    ne           ne          ne         \
+      //     \            \           \          v
+      //      +------------+-----------+----------> bb_phi
+      //
+      // We can only merge the first two comparisons, because bb3* does
+      // "other work" (setting some_value to 3).
+      // We could still merge bb1 and bb2 though.
+      return;
+    }
+    DEBUG(dbgs() << "*Found cmp of " << Comparison.SizeBits()
+                 << " bits between " << Comparison.Lhs().Base() << " + "
+                 << Comparison.Lhs().Offset << " and "
+                 << Comparison.Rhs().Base() << " + " << Comparison.Rhs().Offset
+                 << "\n");
+    DEBUG(dbgs() << "\n");
+    Comparisons.push_back(Comparison);
+  }
+  EntryBlock_ = Comparisons[0].BB;
+  Comparisons_ = std::move(Comparisons);
+#ifdef MERGEICMPS_DOT_ON
+  errs() << "BEFORE REORDERING:\n\n";
+  dump();
+#endif  // MERGEICMPS_DOT_ON
+  // Reorder blocks by LHS. We can do that without changing the
+  // semantics because we are only accessing dereferenceable memory.
+  std::sort(Comparisons_.begin(), Comparisons_.end(),
+            [](const BCECmpBlock &a, const BCECmpBlock &b) {
+              return a.Lhs() < b.Lhs();
+            });
+#ifdef MERGEICMPS_DOT_ON
+  errs() << "AFTER REORDERING:\n\n";
+  dump();
+#endif  // MERGEICMPS_DOT_ON
+}
+
+#ifdef MERGEICMPS_DOT_ON
+void BCECmpChain::dump() const {
+  errs() << "digraph dag {\n";
+  errs() << " graph [bgcolor=transparent];\n";
+  errs() << " node [color=black,style=filled,fillcolor=lightyellow];\n";
+  errs() << " edge [color=black];\n";
+  for (size_t I = 0; I < Comparisons_.size(); ++I) {
+    const auto &Comparison = Comparisons_[I];
+    errs() << " \"" << I << "\" [label=\"%"
+           << Comparison.Lhs().Base()->getName() << " + "
+           << Comparison.Lhs().Offset << " == %"
+           << Comparison.Rhs().Base()->getName() << " + "
+           << Comparison.Rhs().Offset << " (" << (Comparison.SizeBits() / 8)
+           << " bytes)\"];\n";
+    const Value *const Val = Phi_.getIncomingValueForBlock(Comparison.BB);
+    if (I > 0) errs() << " \"" << (I - 1) << "\" -> \"" << I << "\";\n";
+    errs() << " \"" << I << "\" -> \"Phi\" [label=\"" << *Val << "\"];\n";
+  }
+  errs() << " \"Phi\" [label=\"Phi\"];\n";
+  errs() << "}\n\n";
}
+#endif  // MERGEICMPS_DOT_ON
+
+bool BCECmpChain::simplify(const TargetLibraryInfo *const TLI) {
+  // First pass to check if there is at least one merge. If not, we don't do
+  // anything and we keep analysis passes intact.
+ { + bool AtLeastOneMerged = false; + for (size_t I = 1; I < Comparisons_.size(); ++I) { + if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) { + AtLeastOneMerged = true; + break; + } + } + if (!AtLeastOneMerged) return false; + } + + // Remove phi references to comparison blocks, they will be rebuilt as we + // merge the blocks. + for (const auto &Comparison : Comparisons_) { + Phi_.removeIncomingValue(Comparison.BB, false); + } + + // Point the predecessors of the chain to the first comparison block (which is + // the new entry point). + if (EntryBlock_ != Comparisons_[0].BB) + EntryBlock_->replaceAllUsesWith(Comparisons_[0].BB); + + // Effectively merge blocks. + int NumMerged = 1; + for (size_t I = 1; I < Comparisons_.size(); ++I) { + if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) { + ++NumMerged; + } else { + // Merge all previous comparisons and start a new merge block. + mergeComparisons( + makeArrayRef(Comparisons_).slice(I - NumMerged, NumMerged), + Comparisons_[I].BB, Phi_, TLI); + NumMerged = 1; + } + } + mergeComparisons(makeArrayRef(Comparisons_) + .slice(Comparisons_.size() - NumMerged, NumMerged), + nullptr, Phi_, TLI); + + return true; +} + +void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons, + BasicBlock *const NextBBInChain, + PHINode &Phi, + const TargetLibraryInfo *const TLI) { + assert(!Comparisons.empty()); + const auto &FirstComparison = *Comparisons.begin(); + BasicBlock *const BB = FirstComparison.BB; + LLVMContext &Context = BB->getContext(); + + if (Comparisons.size() >= 2) { + DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n"); + const auto TotalSize = + std::accumulate(Comparisons.begin(), Comparisons.end(), 0, + [](int Size, const BCECmpBlock &C) { + return Size + C.SizeBits(); + }) / + 8; + + // Incoming edges do not need to be updated, and both GEPs are already + // computing the right address, we just need to: + // - replace the two loads and the icmp with the memcmp + // - update the branch + // - update the incoming values in the phi. + FirstComparison.BranchI->eraseFromParent(); + FirstComparison.CmpI->eraseFromParent(); + FirstComparison.Lhs().LoadI->eraseFromParent(); + FirstComparison.Rhs().LoadI->eraseFromParent(); + + IRBuilder<> Builder(BB); + const auto &DL = Phi.getModule()->getDataLayout(); + Value *const MemCmpCall = emitMemCmp( + FirstComparison.Lhs().GEP, FirstComparison.Rhs().GEP, ConstantInt::get(DL.getIntPtrType(Context), TotalSize), + Builder, DL, TLI); + Value *const MemCmpIsZero = Builder.CreateICmpEQ( + MemCmpCall, ConstantInt::get(Type::getInt32Ty(Context), 0)); + + // Add a branch to the next basic block in the chain. + if (NextBBInChain) { + Builder.CreateCondBr(MemCmpIsZero, NextBBInChain, Phi.getParent()); + Phi.addIncoming(ConstantInt::getFalse(Context), BB); + } else { + Builder.CreateBr(Phi.getParent()); + Phi.addIncoming(MemCmpIsZero, BB); + } + + // Delete merged blocks. + for (size_t I = 1; I < Comparisons.size(); ++I) { + BasicBlock *CBB = Comparisons[I].BB; + CBB->replaceAllUsesWith(BB); + CBB->eraseFromParent(); + } + } else { + assert(Comparisons.size() == 1); + // There are no blocks to merge, but we still need to update the branches. + DEBUG(dbgs() << "Only one comparison, updating branches\n"); + if (NextBBInChain) { + if (FirstComparison.BranchI->isConditional()) { + DEBUG(dbgs() << "conditional -> conditional\n"); + // Just update the "true" target, the "false" target should already be + // the phi block. 
+      assert(FirstComparison.BranchI->getSuccessor(1) == Phi.getParent());
+      FirstComparison.BranchI->setSuccessor(0, NextBBInChain);
+      Phi.addIncoming(ConstantInt::getFalse(Context), BB);
+    } else {
+      DEBUG(dbgs() << "unconditional -> conditional\n");
+      // Replace the unconditional branch by a conditional one.
+      FirstComparison.BranchI->eraseFromParent();
+      IRBuilder<> Builder(BB);
+      Builder.CreateCondBr(FirstComparison.CmpI, NextBBInChain,
+                           Phi.getParent());
+      Phi.addIncoming(FirstComparison.CmpI, BB);
+    }
+  } else {
+    if (FirstComparison.BranchI->isConditional()) {
+      DEBUG(dbgs() << "conditional -> unconditional\n");
+      // Replace the conditional branch by an unconditional one.
+      FirstComparison.BranchI->eraseFromParent();
+      IRBuilder<> Builder(BB);
+      Builder.CreateBr(Phi.getParent());
+      Phi.addIncoming(FirstComparison.CmpI, BB);
+    } else {
+      DEBUG(dbgs() << "unconditional -> unconditional\n");
+      Phi.addIncoming(FirstComparison.CmpI, BB);
+    }
+  }
+}
+
+std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
+                                           BasicBlock *const LastBlock,
+                                           int NumBlocks) {
+  // Walk up from the last block to find other blocks.
+  std::vector<BasicBlock *> Blocks(NumBlocks);
+  BasicBlock *CurBlock = LastBlock;
+  for (int BlockIndex = NumBlocks - 1; BlockIndex > 0; --BlockIndex) {
+    if (CurBlock->hasAddressTaken()) {
+      // Somebody is jumping to the block through an address; all bets are
+      // off.
+      DEBUG(dbgs() << "skip: block " << BlockIndex
+                   << " has its address taken\n");
+      return {};
+    }
+    Blocks[BlockIndex] = CurBlock;
+    auto *SinglePredecessor = CurBlock->getSinglePredecessor();
+    if (!SinglePredecessor) {
+      // The block has two or more predecessors.
+      DEBUG(dbgs() << "skip: block " << BlockIndex
+                   << " has two or more predecessors\n");
+      return {};
+    }
+    if (Phi.getBasicBlockIndex(SinglePredecessor) < 0) {
+      // The block does not link back to the phi.
+      DEBUG(dbgs() << "skip: block " << BlockIndex
+                   << " does not link back to the phi\n");
+      return {};
+    }
+    CurBlock = SinglePredecessor;
+  }
+  Blocks[0] = CurBlock;
+  return Blocks;
+}
+
+bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) {
+  DEBUG(dbgs() << "processPhi()\n");
+  if (Phi.getNumIncomingValues() <= 1) {
+    DEBUG(dbgs() << "skip: only one incoming value in phi\n");
+    return false;
+  }
+  // We are looking for something that has the following structure:
+  //  bb1 --eq--> bb2 --eq--> bb3 --eq--> bb4 --+
+  //   \            \           \          \
+  //    ne           ne          ne         \
+  //     \            \           \          v
+  //      +------------+-----------+----------> bb_phi
+  //
+  //  - The last basic block (bb4 here) must branch unconditionally to bb_phi.
+  //    It's the only block that contributes a non-constant value to the Phi.
+  //  - All other blocks (bb1, bb2, bb3) must have exactly two successors, one
+  //    of them being the phi block.
+  //  - All intermediate blocks (bb2, bb3) must have only one predecessor.
+  //  - Blocks cannot do other work besides the comparison; see doesOtherWork().
+
+  // The blocks are not necessarily ordered in the phi, so we start from the
+  // last block and reconstruct the order.
+  BasicBlock *LastBlock = nullptr;
+  for (unsigned I = 0; I < Phi.getNumIncomingValues(); ++I) {
+    if (isa<ConstantInt>(Phi.getIncomingValue(I))) continue;
+    if (LastBlock) {
+      // There are several non-constant values.
+      DEBUG(dbgs() << "skip: several non-constant values\n");
+      return false;
+    }
+    LastBlock = Phi.getIncomingBlock(I);
+  }
+  if (!LastBlock) {
+    // There is no non-constant block.
+ DEBUG(dbgs() << "skip: no non-constant block\n"); + return false; + } + if (LastBlock->getSingleSuccessor() != Phi.getParent()) { + DEBUG(dbgs() << "skip: last block non-phi successor\n"); + return false; + } + + const auto Blocks = + getOrderedBlocks(Phi, LastBlock, Phi.getNumIncomingValues()); + if (Blocks.empty()) return false; + BCECmpChain CmpChain(Blocks, Phi); + + if (CmpChain.size() < 2) { + DEBUG(dbgs() << "skip: only one compare block\n"); + return false; + } + + return CmpChain.simplify(TLI); +} + +class MergeICmps : public FunctionPass { + public: + static char ID; + + MergeICmps() : FunctionPass(ID) { + initializeMergeICmpsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) return false; + const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto PA = runImpl(F, &TLI, &TTI); + return !PA.areAllPreserved(); + } + + private: + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + } + + PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI); +}; + +PreservedAnalyses MergeICmps::runImpl(Function &F, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI) { + DEBUG(dbgs() << "MergeICmpsPass: " << F.getName() << "\n"); + + // We only try merging comparisons if the target wants to expand memcmp later. + // The rationale is to avoid turning small chains into memcmp calls. + if (!TTI->enableMemCmpExpansion(true)) return PreservedAnalyses::all(); + + bool MadeChange = false; + + for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) { + // A Phi operation is always first in a basic block. 
+ if (auto *const Phi = dyn_cast<PHINode>(&*BBIt->begin())) + MadeChange |= processPhi(*Phi, TLI); + } + + if (MadeChange) return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +} // namespace + +char MergeICmps::ID = 0; +INITIALIZE_PASS_BEGIN(MergeICmps, "mergeicmps", + "Merge contiguous icmps into a memcmp", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(MergeICmps, "mergeicmps", + "Merge contiguous icmps into a memcmp", false, false) + +Pass *llvm::createMergeICmpsPass() { return new MergeICmps(); } diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp index 6727cf0179c1..f2f615cb9b0f 100644 --- a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -80,11 +80,9 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" -#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Metadata.h" -#include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" @@ -195,7 +193,7 @@ bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction &Start, make_range(Start.getIterator(), End.getIterator())) if (Inst.mayThrow()) return true; - return AA->canInstructionRangeModRef(Start, End, Loc, MRI_ModRef); + return AA->canInstructionRangeModRef(Start, End, Loc, ModRefInfo::ModRef); } /// diff --git a/lib/Transforms/Scalar/NaryReassociate.cpp b/lib/Transforms/Scalar/NaryReassociate.cpp index d0bfe3603897..b026c8d692c3 100644 --- a/lib/Transforms/Scalar/NaryReassociate.cpp +++ b/lib/Transforms/Scalar/NaryReassociate.cpp @@ -77,19 +77,45 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/NaryReassociate.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" +#include <cassert> +#include <cstdint> + using namespace llvm; using namespace PatternMatch; #define DEBUG_TYPE "nary-reassociate" namespace { + class NaryReassociateLegacyPass : public FunctionPass { public: static char ID; @@ -101,6 +127,7 @@ public: bool doInitialization(Module &M) override { return false; } + bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage 
&AU) const override { @@ -118,9 +145,11 @@ public: private: NaryReassociatePass Impl; }; -} // anonymous namespace + +} // end anonymous namespace char NaryReassociateLegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(NaryReassociateLegacyPass, "nary-reassociate", "Nary reassociation", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp index 8ac10348eb77..9ebf2d769356 100644 --- a/lib/Transforms/Scalar/NewGVN.cpp +++ b/lib/Transforms/Scalar/NewGVN.cpp @@ -1,4 +1,4 @@ -//===---- NewGVN.cpp - Global Value Numbering Pass --------------*- C++ -*-===// +//===- NewGVN.cpp - Global Value Numbering Pass ---------------------------===// // // The LLVM Compiler Infrastructure // @@ -6,6 +6,7 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// +// /// \file /// This file implements the new LLVM's Global Value Numbering pass. /// GVN partitions values computed by a function into congruence classes. @@ -48,59 +49,81 @@ /// published algorithms are O(Instructions). Instead, we use a technique that /// is O(number of operations with the same value number), enabling us to skip /// trying to eliminate things that have unique value numbers. +// //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/NewGVN.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/Hashing.h" -#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/TinyPtrVector.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CFGPrinter.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" -#include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Metadata.h" -#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/ArrayRecycler.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/DebugCounter.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/PointerLikeTypeTraits.h" 
+#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVNExpression.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PredicateInfo.h" #include "llvm/Transforms/Utils/VNCoercion.h" -#include <numeric> -#include <unordered_map> +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <map> +#include <memory> +#include <set> +#include <string> +#include <tuple> #include <utility> #include <vector> + using namespace llvm; -using namespace PatternMatch; using namespace llvm::GVNExpression; using namespace llvm::VNCoercion; + #define DEBUG_TYPE "newgvn" STATISTIC(NumGVNInstrDeleted, "Number of instructions deleted"); @@ -118,15 +141,19 @@ STATISTIC(NumGVNPHIOfOpsCreated, "Number of PHI of ops created"); STATISTIC(NumGVNPHIOfOpsEliminations, "Number of things eliminated using PHI of ops"); DEBUG_COUNTER(VNCounter, "newgvn-vn", - "Controls which instructions are value numbered") + "Controls which instructions are value numbered"); DEBUG_COUNTER(PHIOfOpsCounter, "newgvn-phi", - "Controls which instructions we create phi of ops for") + "Controls which instructions we create phi of ops for"); // Currently store defining access refinement is too slow due to basicaa being // egregiously slow. This flag lets us keep it working while we work on this // issue. static cl::opt<bool> EnableStoreRefinement("enable-store-refinement", cl::init(false), cl::Hidden); +/// Currently, the generation "phi of ops" can result in correctness issues. +static cl::opt<bool> EnablePhiOfOps("enable-phi-of-ops", cl::init(true), + cl::Hidden); + //===----------------------------------------------------------------------===// // GVN Pass //===----------------------------------------------------------------------===// @@ -134,6 +161,7 @@ static cl::opt<bool> EnableStoreRefinement("enable-store-refinement", // Anchor methods. namespace llvm { namespace GVNExpression { + Expression::~Expression() = default; BasicExpression::~BasicExpression() = default; CallExpression::~CallExpression() = default; @@ -141,8 +169,11 @@ LoadExpression::~LoadExpression() = default; StoreExpression::~StoreExpression() = default; AggregateValueExpression::~AggregateValueExpression() = default; PHIExpression::~PHIExpression() = default; -} -} + +} // end namespace GVNExpression +} // end namespace llvm + +namespace { // Tarjan's SCC finding algorithm with Nuutila's improvements // SCCIterator is actually fairly complex for the simple thing we want. @@ -153,7 +184,6 @@ PHIExpression::~PHIExpression() = default; // instructions, // not generic values (arguments, etc). struct TarjanSCC { - TarjanSCC() : Components(1) {} void Start(const Instruction *Start) { @@ -208,15 +238,19 @@ private: Stack.push_back(I); } } + unsigned int DFSNum = 1; SmallPtrSet<const Value *, 8> InComponent; DenseMap<const Value *, unsigned int> Root; SmallVector<const Value *, 8> Stack; + // Store the components as vector of ptr sets, because we need the topo order // of SCC's, but not individual member order SmallVector<SmallPtrSet<const Value *, 8>, 8> Components; + DenseMap<const Value *, unsigned> ValueToComponent; }; + // Congruence classes represent the set of expressions/instructions // that are all the same *during some scope in the function*. 
// That is, because of the way we perform equality propagation, and @@ -265,7 +299,9 @@ public: explicit CongruenceClass(unsigned ID) : ID(ID) {} CongruenceClass(unsigned ID, Value *Leader, const Expression *E) : ID(ID), RepLeader(Leader), DefiningExpr(E) {} + unsigned getID() const { return ID; } + // True if this class has no members left. This is mainly used for assertion // purposes, and for skipping empty classes. bool isDead() const { @@ -273,6 +309,7 @@ public: // perspective, it's really dead. return empty() && memory_empty(); } + // Leader functions Value *getLeader() const { return RepLeader; } void setLeader(Value *Leader) { RepLeader = Leader; } @@ -280,7 +317,6 @@ public: return NextLeader; } void resetNextLeader() { NextLeader = {nullptr, ~0}; } - void addPossibleNextLeader(std::pair<Value *, unsigned int> LeaderPair) { if (LeaderPair.second < NextLeader.second) NextLeader = LeaderPair; @@ -315,6 +351,7 @@ public: iterator_range<MemoryMemberSet::const_iterator> memory() const { return make_range(memory_begin(), memory_end()); } + void memory_insert(const MemoryMemberType *M) { MemoryMembers.insert(M); } void memory_erase(const MemoryMemberType *M) { MemoryMembers.erase(M); } @@ -354,34 +391,48 @@ public: private: unsigned ID; + // Representative leader. Value *RepLeader = nullptr; + // The most dominating leader after our current leader, because the member set // is not sorted and is expensive to keep sorted all the time. std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U}; + // If this is represented by a store, the value of the store. Value *RepStoredValue = nullptr; + // If this class contains MemoryDefs or MemoryPhis, this is the leading memory // access. const MemoryAccess *RepMemoryAccess = nullptr; + // Defining Expression. const Expression *DefiningExpr = nullptr; + // Actual members of this class. MemberSet Members; + // This is the set of MemoryPhis that exist in the class. MemoryDefs and // MemoryUses have real instructions representing them, so we only need to // track MemoryPhis here. MemoryMemberSet MemoryMembers; + // Number of stores in this congruence class. // This is used so we can detect store equivalence changes properly. int StoreCount = 0; }; +} // end anonymous namespace + namespace llvm { + struct ExactEqualsExpression { const Expression &E; + explicit ExactEqualsExpression(const Expression &E) : E(E) {} + hash_code getComputedHash() const { return E.getComputedHash(); } + bool operator==(const Expression &Other) const { return E.exactlyEquals(Other); } @@ -393,17 +444,21 @@ template <> struct DenseMapInfo<const Expression *> { Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable; return reinterpret_cast<const Expression *>(Val); } + static const Expression *getTombstoneKey() { auto Val = static_cast<uintptr_t>(~1U); Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable; return reinterpret_cast<const Expression *>(Val); } + static unsigned getHashValue(const Expression *E) { return E->getComputedHash(); } + static unsigned getHashValue(const ExactEqualsExpression &E) { return E.getComputedHash(); } + static bool isEqual(const ExactEqualsExpression &LHS, const Expression *RHS) { if (RHS == getTombstoneKey() || RHS == getEmptyKey()) return false; @@ -425,9 +480,11 @@ template <> struct DenseMapInfo<const Expression *> { return *LHS == *RHS; } }; + } // end namespace llvm namespace { + class NewGVN { Function &F; DominatorTree *DT; @@ -464,16 +521,22 @@ class NewGVN { // Value Mappings. 
DenseMap<Value *, CongruenceClass *> ValueToClass; DenseMap<Value *, const Expression *> ValueToExpression; + // Value PHI handling, used to make equivalence between phi(op, op) and // op(phi, phi). // These mappings just store various data that would normally be part of the // IR. - DenseSet<const Instruction *> PHINodeUses; + SmallPtrSet<const Instruction *, 8> PHINodeUses; + + DenseMap<const Value *, bool> OpSafeForPHIOfOps; + // Map a temporary instruction we created to a parent block. DenseMap<const Value *, BasicBlock *> TempToBlock; - // Map between the temporary phis we created and the real instructions they - // are known equivalent to. + + // Map between the already in-program instructions and the temporary phis we + // created that they are known equivalent to. DenseMap<const Value *, PHINode *> RealToTemp; + // In order to know when we should re-process instructions that have // phi-of-ops, we track the set of expressions that they needed as // leaders. When we discover new leaders for those expressions, we process the @@ -485,19 +548,32 @@ class NewGVN { mutable DenseMap<const Value *, SmallPtrSet<Value *, 2>> AdditionalUsers; DenseMap<const Expression *, SmallPtrSet<Instruction *, 2>> ExpressionToPhiOfOps; - // Map from basic block to the temporary operations we created - DenseMap<const BasicBlock *, SmallVector<PHINode *, 8>> PHIOfOpsPHIs; + // Map from temporary operation to MemoryAccess. DenseMap<const Instruction *, MemoryUseOrDef *> TempToMemory; + // Set of all temporary instructions we created. + // Note: This will include instructions that were just created during value + // numbering. The way to test if something is using them is to check + // RealToTemp. DenseSet<Instruction *> AllTempInstructions; + // This is the set of instructions to revisit on a reachability change. At + // the end of the main iteration loop it will contain at least all the phi of + // ops instructions that will be changed to phis, as well as regular phis. + // During the iteration loop, it may contain other things, such as phi of ops + // instructions that used edge reachability to reach a result, and so need to + // be revisited when the edge changes, independent of whether the phi they + // depended on changes. + DenseMap<BasicBlock *, SparseBitVector<>> RevisitOnReachabilityChange; + // Mapping from predicate info we used to the instructions we used it with. // In order to correctly ensure propagation, we must keep track of what // comparisons we used, so that when the values of the comparisons change, we // propagate the information to the places we used the comparison. mutable DenseMap<const Value *, SmallPtrSet<Instruction *, 2>> PredicateToUsers; + // the same reasoning as PredicateToUsers. When we skip MemoryAccesses for // stores, we no longer can rely solely on the def-use chains of MemorySSA. mutable DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>> @@ -525,6 +601,7 @@ class NewGVN { enum InstCycleState { ICS_Unknown, ICS_CycleFree, ICS_Cycle }; mutable DenseMap<const Instruction *, InstCycleState> InstCycleState; + // Expression to class mapping. 
using ExpressionClassMap = DenseMap<const Expression *, CongruenceClass *>; ExpressionClassMap ExpressionToClass; @@ -581,6 +658,7 @@ public: : F(F), DT(DT), TLI(TLI), AA(AA), MSSA(MSSA), DL(DL), PredInfo(make_unique<PredicateInfo>(F, *DT, *AC)), SQ(DL, TLI, DT, AC) { } + bool runGVN(); private: @@ -588,7 +666,13 @@ private: const Expression *createExpression(Instruction *) const; const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *, Instruction *) const; - PHIExpression *createPHIExpression(Instruction *, bool &HasBackEdge, + + // Our canonical form for phi arguments is a pair of incoming value, incoming + // basic block. + using ValPair = std::pair<Value *, BasicBlock *>; + + PHIExpression *createPHIExpression(ArrayRef<ValPair>, const Instruction *, + BasicBlock *, bool &HasBackEdge, bool &OriginalOpsConstant) const; const DeadExpression *createDeadExpression() const; const VariableExpression *createVariableExpression(Value *) const; @@ -617,6 +701,7 @@ private: CC->setMemoryLeader(MA); return CC; } + CongruenceClass *ensureLeaderOfMemoryClass(MemoryAccess *MA) { auto *CC = getMemoryClass(MA); if (CC->getMemoryLeader() != MA) @@ -630,10 +715,21 @@ private: ValueToClass[Member] = CClass; return CClass; } + void initializeCongruenceClasses(Function &F); - const Expression *makePossiblePhiOfOps(Instruction *, + const Expression *makePossiblePHIOfOps(Instruction *, SmallPtrSetImpl<Value *> &); + Value *findLeaderForInst(Instruction *ValueOp, + SmallPtrSetImpl<Value *> &Visited, + MemoryAccess *MemAccess, Instruction *OrigInst, + BasicBlock *PredBB); + bool OpIsSafeForPHIOfOpsHelper(Value *V, const BasicBlock *PHIBlock, + SmallPtrSetImpl<const Value *> &Visited, + SmallVectorImpl<Instruction *> &Worklist); + bool OpIsSafeForPHIOfOps(Value *Op, const BasicBlock *PHIBlock, + SmallPtrSetImpl<const Value *> &); void addPhiOfOps(PHINode *Op, BasicBlock *BB, Instruction *ExistingValue); + void removePhiOfOps(Instruction *I, PHINode *PHITemp); // Value number an Instruction or MemoryPhi. void valueNumberMemoryPhi(MemoryPhi *); @@ -650,7 +746,10 @@ private: const Expression *performSymbolicLoadEvaluation(Instruction *) const; const Expression *performSymbolicStoreEvaluation(Instruction *) const; const Expression *performSymbolicCallEvaluation(Instruction *) const; - const Expression *performSymbolicPHIEvaluation(Instruction *) const; + void sortPHIOps(MutableArrayRef<ValPair> Ops) const; + const Expression *performSymbolicPHIEvaluation(ArrayRef<ValPair>, + Instruction *I, + BasicBlock *PHIBlock) const; const Expression *performSymbolicAggrValueEvaluation(Instruction *) const; const Expression *performSymbolicCmpEvaluation(Instruction *) const; const Expression *performSymbolicPredicateInfoEvaluation(Instruction *) const; @@ -658,6 +757,7 @@ private: // Congruence finding. 
bool someEquivalentDominates(const Instruction *, const Instruction *) const; Value *lookupOperandLeader(Value *) const; + CongruenceClass *getClassForExpression(const Expression *E) const; void performCongruenceFinding(Instruction *, const Expression *); void moveValueToNewCongruenceClass(Instruction *, const Expression *, CongruenceClass *, CongruenceClass *); @@ -692,10 +792,11 @@ private: void replaceInstruction(Instruction *, Value *); void markInstructionForDeletion(Instruction *); void deleteInstructionsInBlock(BasicBlock *); - Value *findPhiOfOpsLeader(const Expression *E, const BasicBlock *BB) const; + Value *findPHIOfOpsLeader(const Expression *, const Instruction *, + const BasicBlock *) const; // New instruction creation. - void handleNewInstruction(Instruction *){}; + void handleNewInstruction(Instruction *) {} // Various instruction touch utilities template <typename Map, typename KeyType, typename Func> @@ -731,6 +832,7 @@ private: MemoryAccess *getDefiningAccess(const MemoryAccess *) const; MemoryPhi *getMemoryAccess(const BasicBlock *) const; template <class T, class Range> T *getMinDFSOfRange(const Range &) const; + unsigned InstrToDFSNum(const Value *V) const { assert(isa<Instruction>(V) && "This should not be used for MemoryAccesses"); return InstrDFS.lookup(V); @@ -739,7 +841,9 @@ private: unsigned InstrToDFSNum(const MemoryAccess *MA) const { return MemoryToDFSNum(MA); } + Value *InstrFromDFSNum(unsigned DFSNum) { return DFSToInstr[DFSNum]; } + // Given a MemoryAccess, return the relevant instruction DFS number. Note: // This deliberately takes a value so it can be used with Use's, which will // auto-convert to Value's but not to MemoryAccess's. @@ -750,12 +854,15 @@ private: ? InstrToDFSNum(cast<MemoryUseOrDef>(MA)->getMemoryInst()) : InstrDFS.lookup(MA); } + bool isCycleFree(const Instruction *) const; bool isBackedge(BasicBlock *From, BasicBlock *To) const; + // Debug counter info. When verifying, we have to reset the value numbering // debug counter to the same state it started in to get the same results. std::pair<int, int> StartingVNCounter; }; + } // end anonymous namespace template <typename T> @@ -781,11 +888,9 @@ bool StoreExpression::equals(const Expression &Other) const { // Determine if the edge From->To is a backedge bool NewGVN::isBackedge(BasicBlock *From, BasicBlock *To) const { - if (From == To) - return true; - auto *FromDTN = DT->getNode(From); - auto *ToDTN = DT->getNode(To); - return RPOOrdering.lookup(FromDTN) >= RPOOrdering.lookup(ToDTN); + return From == To || + RPOOrdering.lookup(DT->getNode(From)) >= + RPOOrdering.lookup(DT->getNode(To)); } #ifndef NDEBUG @@ -830,51 +935,77 @@ void NewGVN::deleteExpression(const Expression *E) const { const_cast<BasicExpression *>(BE)->deallocateOperands(ArgRecycler); ExpressionAllocator.Deallocate(E); } -PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge, + +// If V is a predicateinfo copy, get the thing it is a copy of. +static Value *getCopyOf(const Value *V) { + if (auto *II = dyn_cast<IntrinsicInst>(V)) + if (II->getIntrinsicID() == Intrinsic::ssa_copy) + return II->getOperand(0); + return nullptr; +} + +// Return true if V is really PN, even accounting for predicateinfo copies. +static bool isCopyOfPHI(const Value *V, const PHINode *PN) { + return V == PN || getCopyOf(V) == PN; +} + +static bool isCopyOfAPHI(const Value *V) { + auto *CO = getCopyOf(V); + return CO && isa<PHINode>(CO); +} + +// Sort PHI Operands into a canonical order. What we use here is an RPO +// order. 
The BlockInstRange numbers are generated in an RPO walk of the basic +// blocks. +void NewGVN::sortPHIOps(MutableArrayRef<ValPair> Ops) const { + std::sort(Ops.begin(), Ops.end(), [&](const ValPair &P1, const ValPair &P2) { + return BlockInstRange.lookup(P1.second).first < + BlockInstRange.lookup(P2.second).first; + }); +} + +// Return true if V is a value that will always be available (IE can +// be placed anywhere) in the function. We don't do globals here +// because they are often worse to put in place. +static bool alwaysAvailable(Value *V) { + return isa<Constant>(V) || isa<Argument>(V); +} + +// Create a PHIExpression from an array of {incoming edge, value} pairs. I is +// the original instruction we are creating a PHIExpression for (but may not be +// a phi node). We require, as an invariant, that all the PHIOperands in the +// same block are sorted the same way. sortPHIOps will sort them into a +// canonical order. +PHIExpression *NewGVN::createPHIExpression(ArrayRef<ValPair> PHIOperands, + const Instruction *I, + BasicBlock *PHIBlock, + bool &HasBackedge, bool &OriginalOpsConstant) const { - BasicBlock *PHIBlock = getBlockForValue(I); - auto *PN = cast<PHINode>(I); - auto *E = - new (ExpressionAllocator) PHIExpression(PN->getNumOperands(), PHIBlock); + unsigned NumOps = PHIOperands.size(); + auto *E = new (ExpressionAllocator) PHIExpression(NumOps, PHIBlock); E->allocateOperands(ArgRecycler, ExpressionAllocator); - E->setType(I->getType()); - E->setOpcode(I->getOpcode()); - - // NewGVN assumes the operands of a PHI node are in a consistent order across - // PHIs. LLVM doesn't seem to always guarantee this. While we need to fix - // this in LLVM at some point we don't want GVN to find wrong congruences. - // Therefore, here we sort uses in predecessor order. - // We're sorting the values by pointer. In theory this might be cause of - // non-determinism, but here we don't rely on the ordering for anything - // significant, e.g. we don't create new instructions based on it so we're - // fine. - SmallVector<const Use *, 4> PHIOperands; - for (const Use &U : PN->operands()) - PHIOperands.push_back(&U); - std::sort(PHIOperands.begin(), PHIOperands.end(), - [&](const Use *U1, const Use *U2) { - return PN->getIncomingBlock(*U1) < PN->getIncomingBlock(*U2); - }); + E->setType(PHIOperands.begin()->first->getType()); + E->setOpcode(Instruction::PHI); // Filter out unreachable phi operands. - auto Filtered = make_filter_range(PHIOperands, [&](const Use *U) { - if (*U == PN) - return false; - if (!ReachableEdges.count({PN->getIncomingBlock(*U), PHIBlock})) + auto Filtered = make_filter_range(PHIOperands, [&](const ValPair &P) { + auto *BB = P.second; + if (auto *PHIOp = dyn_cast<PHINode>(I)) + if (isCopyOfPHI(P.first, PHIOp)) + return false; + if (!ReachableEdges.count({BB, PHIBlock})) return false; // Things in TOPClass are equivalent to everything. 
- if (ValueToClass.lookup(*U) == TOPClass) + if (ValueToClass.lookup(P.first) == TOPClass) return false; - return lookupOperandLeader(*U) != PN; + OriginalOpsConstant = OriginalOpsConstant && isa<Constant>(P.first); + HasBackedge = HasBackedge || isBackedge(BB, PHIBlock); + return lookupOperandLeader(P.first) != I; }); std::transform(Filtered.begin(), Filtered.end(), op_inserter(E), - [&](const Use *U) -> Value * { - auto *BB = PN->getIncomingBlock(*U); - HasBackedge = HasBackedge || isBackedge(BB, PHIBlock); - OriginalOpsConstant = - OriginalOpsConstant && isa<Constant>(*U); - return lookupOperandLeader(*U); + [&](const ValPair &P) -> Value * { + return lookupOperandLeader(P.first); }); return E; } @@ -929,8 +1060,6 @@ const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T, // Take a Value returned by simplification of Expression E/Instruction // I, and see if it resulted in a simpler expression. If so, return // that expression. -// TODO: Once finished, this should not take an Instruction, we only -// use it for printing. const Expression *NewGVN::checkSimplificationResults(Expression *E, Instruction *I, Value *V) const { @@ -954,25 +1083,37 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E, } CongruenceClass *CC = ValueToClass.lookup(V); - if (CC && CC->getDefiningExpr()) { - // If we simplified to something else, we need to communicate - // that we're users of the value we simplified to. - if (I != V) { + if (CC) { + if (CC->getLeader() && CC->getLeader() != I) { // Don't add temporary instructions to the user lists. if (!AllTempInstructions.count(I)) addAdditionalUsers(V, I); + return createVariableOrConstant(CC->getLeader()); } + if (CC->getDefiningExpr()) { + // If we simplified to something else, we need to communicate + // that we're users of the value we simplified to. + if (I != V) { + // Don't add temporary instructions to the user lists. + if (!AllTempInstructions.count(I)) + addAdditionalUsers(V, I); + } - if (I) - DEBUG(dbgs() << "Simplified " << *I << " to " - << " expression " << *CC->getDefiningExpr() << "\n"); - NumGVNOpsSimplified++; - deleteExpression(E); - return CC->getDefiningExpr(); + if (I) + DEBUG(dbgs() << "Simplified " << *I << " to " + << " expression " << *CC->getDefiningExpr() << "\n"); + NumGVNOpsSimplified++; + deleteExpression(E); + return CC->getDefiningExpr(); + } } + return nullptr; } +// Create a value expression from the instruction I, replacing operands with +// their leaders. + const Expression *NewGVN::createExpression(Instruction *I) const { auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands()); @@ -987,15 +1128,7 @@ const Expression *NewGVN::createExpression(Instruction *I) const { if (shouldSwapOperands(E->getOperand(0), E->getOperand(1))) E->swapOperands(0, 1); } - - // Perform simplificaiton - // TODO: Right now we only check to see if we get a constant result. - // We may get a less than constant, but still better, result for - // some operations. - // IE - // add 0, x -> x - // and x, x -> x - // We should handle this by simply rewriting the expression. + // Perform simplification. if (auto *CI = dyn_cast<CmpInst>(I)) { // Sort the operand value numbers so x<y and y>x get the same value // number. 
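The comment ending the hunk above is the whole trick behind commutative canonicalization in NewGVN; a hedged source-level illustration (the function is invented for exposition):

```cpp
// Illustration only: with cmp operands sorted and the predicate swapped to
// match, both compares map to the same expression, land in one congruence
// class, and the second becomes redundant.
bool sameOrder(int X, int Y) {
  bool A = X < Y;
  bool B = Y > X; // swapped operands + swapped predicate = same value number
  return A == B;  // provably true once A and B are congruent
}
```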
@@ -1016,7 +1149,7 @@ const Expression *NewGVN::createExpression(Instruction *I) const { return SimplifiedE; } else if (isa<SelectInst>(I)) { if (isa<Constant>(E->getOperand(0)) || - E->getOperand(0) == E->getOperand(1)) { + E->getOperand(1) == E->getOperand(2)) { assert(E->getOperand(1)->getType() == I->getOperand(1)->getType() && E->getOperand(2)->getType() == I->getOperand(2)->getType()); Value *V = SimplifySelectInst(E->getOperand(0), E->getOperand(1), @@ -1121,7 +1254,7 @@ NewGVN::createCallExpression(CallInst *CI, const MemoryAccess *MA) const { bool NewGVN::someEquivalentDominates(const Instruction *Inst, const Instruction *U) const { auto *CC = ValueToClass.lookup(Inst); - // This must be an instruction because we are only called from phi nodes + // This must be an instruction because we are only called from phi nodes // in the case that the value it needs to check against is an instruction. // The most likely candiates for dominance are the leader and the next leader. @@ -1139,6 +1272,8 @@ bool NewGVN::someEquivalentDominates(const Instruction *Inst, // any of these siblings. if (!CC) return false; + if (alwaysAvailable(CC->getLeader())) + return true; if (DT->dominates(cast<Instruction>(CC->getLeader()), U)) return true; if (CC->getNextLeader().first && @@ -1229,9 +1364,9 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const { if (EnableStoreRefinement) StoreRHS = MSSAWalker->getClobberingMemoryAccess(StoreAccess); // If we bypassed the use-def chains, make sure we add a use. + StoreRHS = lookupMemoryLeader(StoreRHS); if (StoreRHS != StoreAccess->getDefiningAccess()) addMemoryUsers(StoreRHS, StoreAccess); - StoreRHS = lookupMemoryLeader(StoreRHS); // If we are defined by ourselves, use the live on entry def. if (StoreRHS == StoreAccess) StoreRHS = MSSA->getLiveOnEntryDef(); @@ -1278,7 +1413,7 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, if (auto *DepSI = dyn_cast<StoreInst>(DepInst)) { // Can't forward from non-atomic to atomic without violating memory model. // Also don't need to coerce if they are the same type, we will just - // propogate.. + // propagate. if (LI->isAtomic() > DepSI->isAtomic() || LoadType == DepSI->getValueOperand()->getType()) return nullptr; @@ -1292,14 +1427,13 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, getConstantStoreValueForLoad(C, Offset, LoadType, DL)); } } - - } else if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) { + } else if (auto *DepLI = dyn_cast<LoadInst>(DepInst)) { // Can't forward from non-atomic to atomic without violating memory model. if (LI->isAtomic() > DepLI->isAtomic()) return nullptr; int Offset = analyzeLoadFromClobberingLoad(LoadType, LoadPtr, DepLI, DL); if (Offset >= 0) { - // We can coerce a constant load into a load + // We can coerce a constant load into a load. 
if (auto *C = dyn_cast<Constant>(lookupOperandLeader(DepLI))) if (auto *PossibleConstant = getConstantLoadValueForLoad(C, Offset, LoadType, DL)) { @@ -1308,8 +1442,7 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, return createConstantExpression(PossibleConstant); } } - - } else if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInst)) { + } else if (auto *DepMI = dyn_cast<MemIntrinsic>(DepInst)) { int Offset = analyzeLoadFromClobberingMemInst(LoadType, LoadPtr, DepMI, DL); if (Offset >= 0) { if (auto *PossibleConstant = @@ -1381,9 +1514,13 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const { } } - const Expression *E = createLoadExpression(LI->getType(), LoadAddressLeader, - LI, DefiningAccess); - return E; + const auto *LE = createLoadExpression(LI->getType(), LoadAddressLeader, LI, + DefiningAccess); + // If our MemoryLeader is not our defining access, add a use to the + // MemoryLeader, so that we get reprocessed when it changes. + if (LE->getMemoryLeader() != DefiningAccess) + addMemoryUsers(LE->getMemoryLeader(), OriginalAccess); + return LE; } const Expression * @@ -1402,7 +1539,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const { auto *Cond = PWC->Condition; // If this a copy of the condition, it must be either true or false depending - // on the predicate info type and edge + // on the predicate info type and edge. if (CopyOf == Cond) { // We should not need to add predicate users because the predicate info is // already a use of this operand. @@ -1438,7 +1575,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const { Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0)); Value *SecondOp = lookupOperandLeader(Cmp->getOperand(1)); bool SwappedOps = false; - // Sort the ops + // Sort the ops. if (shouldSwapOperands(FirstOp, SecondOp)) { std::swap(FirstOp, SecondOp); SwappedOps = true; @@ -1464,7 +1601,8 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const { if ((PBranch->TrueEdge && Predicate == CmpInst::ICMP_EQ) || (!PBranch->TrueEdge && Predicate == CmpInst::ICMP_NE)) { addPredicateUsers(PI, I); - addAdditionalUsers(Cmp->getOperand(0), I); + addAdditionalUsers(SwappedOps ? Cmp->getOperand(1) : Cmp->getOperand(0), + I); return createVariableOrConstant(FirstOp); } // Handle the special case of floating point. @@ -1472,7 +1610,8 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const { (!PBranch->TrueEdge && Predicate == CmpInst::FCMP_UNE)) && isa<ConstantFP>(FirstOp) && !cast<ConstantFP>(FirstOp)->isZero()) { addPredicateUsers(PI, I); - addAdditionalUsers(Cmp->getOperand(0), I); + addAdditionalUsers(SwappedOps ? Cmp->getOperand(1) : Cmp->getOperand(0), + I); return createConstantExpression(cast<Constant>(FirstOp)); } } @@ -1502,7 +1641,6 @@ const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const { // Retrieve the memory class for a given MemoryAccess. CongruenceClass *NewGVN::getMemoryClass(const MemoryAccess *MA) const { - auto *Result = MemoryAccessToClass.lookup(MA); assert(Result && "Should have found memory class"); return Result; @@ -1571,8 +1709,9 @@ bool NewGVN::isCycleFree(const Instruction *I) const { if (SCC.size() == 1) InstCycleState.insert({I, ICS_CycleFree}); else { - bool AllPhis = - llvm::all_of(SCC, [](const Value *V) { return isa<PHINode>(V); }); + bool AllPhis = llvm::all_of(SCC, [](const Value *V) { + return isa<PHINode>(V) || isCopyOfAPHI(V); + }); ICS = AllPhis ? 
ICS_CycleFree : ICS_Cycle; for (auto *Member : SCC) if (auto *MemberPhi = dyn_cast<PHINode>(Member)) @@ -1584,17 +1723,20 @@ bool NewGVN::isCycleFree(const Instruction *I) const { return true; } -// Evaluate PHI nodes symbolically, and create an expression result. -const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const { +// Evaluate PHI nodes symbolically and create an expression result. +const Expression * +NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps, + Instruction *I, + BasicBlock *PHIBlock) const { // True if one of the incoming phi edges is a backedge. bool HasBackedge = false; // All constant tracks the state of whether all the *original* phi operands // This is really shorthand for "this phi cannot cycle due to forward // change in value of the phi is guaranteed not to later change the value of // the phi. IE it can't be v = phi(undef, v+1) - bool AllConstant = true; - auto *E = - cast<PHIExpression>(createPHIExpression(I, HasBackedge, AllConstant)); + bool OriginalOpsConstant = true; + auto *E = cast<PHIExpression>(createPHIExpression( + PHIOps, I, PHIBlock, HasBackedge, OriginalOpsConstant)); // We match the semantics of SimplifyPhiNode from InstructionSimplify here. // See if all arguments are the same. // We track if any were undef because they need special handling. @@ -1620,14 +1762,10 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const { deleteExpression(E); return createDeadExpression(); } - unsigned NumOps = 0; Value *AllSameValue = *(Filtered.begin()); ++Filtered.begin(); // Can't use std::equal here, sadly, because filter.begin moves. - if (llvm::all_of(Filtered, [&](Value *Arg) { - ++NumOps; - return Arg == AllSameValue; - })) { + if (llvm::all_of(Filtered, [&](Value *Arg) { return Arg == AllSameValue; })) { // In LLVM's non-standard representation of phi nodes, it's possible to have // phi nodes with cycles (IE dependent on other phis that are .... dependent // on the original phi node), especially in weird CFG's where some arguments @@ -1642,9 +1780,8 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const { // multivalued phi, and we need to know if it's cycle free in order to // evaluate whether we can ignore the undef. The other parts of this are // just shortcuts. If there is no backedge, or all operands are - // constants, or all operands are ignored but the undef, it also must be - // cycle free. - if (!AllConstant && HasBackedge && NumOps > 0 && + // constants, it also must be cycle free. + if (HasBackedge && !OriginalOpsConstant && !isa<UndefValue>(AllSameValue) && !isCycleFree(I)) return E; @@ -1708,8 +1845,11 @@ NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) const { return createAggregateValueExpression(I); } + const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const { - auto *CI = dyn_cast<CmpInst>(I); + assert(isa<CmpInst>(I) && "Expected a cmp instruction."); + + auto *CI = cast<CmpInst>(I); // See if our operands are equal to those of a previous predicate, and if so, // if it implies true or false. auto Op0 = lookupOperandLeader(CI->getOperand(0)); @@ -1720,7 +1860,7 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const { OurPredicate = CI->getSwappedPredicate(); } - // Avoid processing the same info twice + // Avoid processing the same info twice. const PredicateBase *LastPredInfo = nullptr; // See if we know something about the comparison itself, like it is the target // of an assume. 
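// --- Editorial sketch (annotation, not part of the patch) -------------------
// The "all arguments are the same" case that performSymbolicPHIEvaluation
// handles above, in hypothetical IR terms: once operands on unreachable edges
// and self-references are filtered out, a phi whose remaining operands share
// one leader collapses to that leader.
//
//   %p = phi i32 [ %x, %bb1 ], [ %x, %bb2 ], [ %p, %backedge ]
//   ; filters to {%x, %x}; value-numbers to %x, so uses of %p become
//   ; congruent with uses of %x.
//
// The undef/backedge caveat in the surrounding comments is why this collapse
// is additionally gated on isCycleFree(I) when a backedge is present.
// --- End sketch --------------------------------------------------------------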
@@ -1754,7 +1894,7 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const { // %operands are considered users of the icmp. // *Currently* we only check one level of comparisons back, and only mark one - // level back as touched when changes appen . If you modify this code to look + // level back as touched when changes happen. If you modify this code to look // back farther through comparisons, you *must* mark the appropriate // comparisons as users in PredicateInfo.cpp, or you will cause bugs. See if // we know something just from the operands themselves @@ -1767,10 +1907,15 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const { if (PI == LastPredInfo) continue; LastPredInfo = PI; - - // TODO: Along the false edge, we may know more things too, like icmp of + // In phi of ops cases, we may have predicate info that we are evaluating + // in a different context. + if (!DT->dominates(PBranch->To, getBlockForValue(I))) + continue; + // TODO: Along the false edge, we may know more things too, like + // icmp of // same operands is false. - // TODO: We only handle actual comparison conditions below, not and/or. + // TODO: We only handle actual comparison conditions below, not + // and/or. auto *BranchCond = dyn_cast<CmpInst>(PBranch->Condition); if (!BranchCond) continue; @@ -1798,7 +1943,6 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const { return createConstantExpression( ConstantInt::getFalse(CI->getType())); } - } else { // Just handle the ne and eq cases, where if we have the same // operands, we may know something. @@ -1822,14 +1966,6 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const { return createExpression(I); } -// Return true if V is a value that will always be available (IE can -// be placed anywhere) in the function. We don't do globals here -// because they are often worse to put in place. -// TODO: Separate cost from availability -static bool alwaysAvailable(Value *V) { - return isa<Constant>(V) || isa<Argument>(V); -} - // Substitute and symbolize the value before value numbering. const Expression * NewGVN::performSymbolicEvaluation(Value *V, @@ -1849,9 +1985,15 @@ NewGVN::performSymbolicEvaluation(Value *V, case Instruction::InsertValue: E = performSymbolicAggrValueEvaluation(I); break; - case Instruction::PHI: - E = performSymbolicPHIEvaluation(I); - break; + case Instruction::PHI: { + SmallVector<ValPair, 3> Ops; + auto *PN = cast<PHINode>(I); + for (unsigned i = 0; i < PN->getNumOperands(); ++i) + Ops.push_back({PN->getIncomingValue(i), PN->getIncomingBlock(i)}); + // Sort to ensure the invariant createPHIExpression requires is met. 
+ sortPHIOps(Ops);
+ E = performSymbolicPHIEvaluation(Ops, I, getBlockForValue(I));
+ } break;
case Instruction::Call:
E = performSymbolicCallEvaluation(I);
break;
@@ -1861,13 +2003,13 @@ NewGVN::performSymbolicEvaluation(Value *V,
case Instruction::Load:
E = performSymbolicLoadEvaluation(I);
break;
- case Instruction::BitCast: {
+ case Instruction::BitCast:
E = createExpression(I);
- } break;
+ break;
case Instruction::ICmp:
- case Instruction::FCmp: {
+ case Instruction::FCmp:
E = performSymbolicCmpEvaluation(I);
- } break;
+ break;
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
@@ -2017,7 +2159,7 @@ T *NewGVN::getMinDFSOfRange(const Range &R) const {
const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const {
// TODO: If this ends up to slow, we can maintain a next memory leader like we
// do for regular leaders.
- // Make sure there will be a leader to find
+ // Make sure there will be a leader to find.
assert(!CC->definesNoMemory() && "Can't get next leader if there is none");
if (CC->getStoreCount() > 0) {
if (auto *NL = dyn_cast_or_null<StoreInst>(CC->getNextLeader().first))
@@ -2194,7 +2336,7 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
// For a given expression, mark the phi of ops instructions that could have
// changed as a result.
void NewGVN::markPhiOfOpsChanged(const Expression *E) {
- touchAndErase(ExpressionToPhiOfOps, ExactEqualsExpression(*E));
+ touchAndErase(ExpressionToPhiOfOps, E);
}
// Perform congruence finding on a given value numbering expression.
@@ -2315,14 +2457,11 @@ void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {
if (MemoryAccess *MemPhi = getMemoryAccess(To))
TouchedInstructions.set(InstrToDFSNum(MemPhi));
- auto BI = To->begin();
- while (isa<PHINode>(BI)) {
- TouchedInstructions.set(InstrToDFSNum(&*BI));
- ++BI;
- }
- for_each_found(PHIOfOpsPHIs, To, [&](const PHINode *I) {
- TouchedInstructions.set(InstrToDFSNum(I));
- });
+ // FIXME: We should just add a union op on a Bitvector and
+ // SparseBitVector. We can do it word by word faster than we are doing it
+ // here.
+ for (auto InstNum : RevisitOnReachabilityChange[To])
+ TouchedInstructions.set(InstNum);
}
}
}
@@ -2419,24 +2558,146 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
}
}
+// Remove the PHI of Ops PHI for I.
+void NewGVN::removePhiOfOps(Instruction *I, PHINode *PHITemp) {
+ InstrDFS.erase(PHITemp);
+ // It's still a temp instruction. We keep it in the array so it gets erased.
+ // However, it's no longer used by I, or in the block.
+ TempToBlock.erase(PHITemp);
+ RealToTemp.erase(I);
+ // We don't remove the users from the phi node uses. This wastes a little
+ // time, but such is life. We could use two sets to track which were there
+ // at the start of NewGVN, and which were added, but right now the cost of
+ // tracking is more than the cost of checking for more phi of ops.
+}
+
+// Add PHI Op in BB as a PHI of operations version of ExistingValue.
void NewGVN::addPhiOfOps(PHINode *Op, BasicBlock *BB,
Instruction *ExistingValue) {
InstrDFS[Op] = InstrToDFSNum(ExistingValue);
AllTempInstructions.insert(Op);
- PHIOfOpsPHIs[BB].push_back(Op);
TempToBlock[Op] = BB;
RealToTemp[ExistingValue] = Op;
+ // Add all users to phi node use, as they are now uses of the phi of ops phis
+ // and may themselves be phi of ops.
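// --- Editorial sketch (annotation, not part of the patch) -------------------
// Re the FIXME in updateReachableEdge above: a dense-bitvector |= sparse-
// bitvector union can proceed one machine word at a time instead of one bit
// at a time. A minimal illustration with plain 64-bit words, not LLVM's
// BitVector / SparseBitVector API:
#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>
// Dense |= Sparse, where Sparse maps a word index to a 64-bit mask.
void unionWordsSketch(std::vector<uint64_t> &Dense,
                      const std::map<std::size_t, uint64_t> &Sparse) {
  for (const auto &KV : Sparse)
    if (KV.first < Dense.size())
      Dense[KV.first] |= KV.second; // one OR covers up to 64 set bits
}
// --- End sketch --------------------------------------------------------------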
+ for (auto *U : ExistingValue->users())
+ if (auto *UI = dyn_cast<Instruction>(U))
+ PHINodeUses.insert(UI);
}
static bool okayForPHIOfOps(const Instruction *I) {
+ if (!EnablePhiOfOps)
+ return false;
return isa<BinaryOperator>(I) || isa<SelectInst>(I) || isa<CmpInst>(I) ||
isa<LoadInst>(I);
}
+bool NewGVN::OpIsSafeForPHIOfOpsHelper(
+ Value *V, const BasicBlock *PHIBlock,
+ SmallPtrSetImpl<const Value *> &Visited,
+ SmallVectorImpl<Instruction *> &Worklist) {
+
+ if (!isa<Instruction>(V))
+ return true;
+ auto OISIt = OpSafeForPHIOfOps.find(V);
+ if (OISIt != OpSafeForPHIOfOps.end())
+ return OISIt->second;
+
+ // Keep walking until we either dominate the phi block, or hit a phi, or run
+ // out of things to check.
+ if (DT->properlyDominates(getBlockForValue(V), PHIBlock)) {
+ OpSafeForPHIOfOps.insert({V, true});
+ return true;
+ }
+ // PHI in the same block.
+ if (isa<PHINode>(V) && getBlockForValue(V) == PHIBlock) {
+ OpSafeForPHIOfOps.insert({V, false});
+ return false;
+ }
+
+ auto *OrigI = cast<Instruction>(V);
+ for (auto *Op : OrigI->operand_values()) {
+ if (!isa<Instruction>(Op))
+ continue;
+ // Stop now if we find an unsafe operand.
+ auto OISIt = OpSafeForPHIOfOps.find(OrigI);
+ if (OISIt != OpSafeForPHIOfOps.end()) {
+ if (!OISIt->second) {
+ OpSafeForPHIOfOps.insert({V, false});
+ return false;
+ }
+ continue;
+ }
+ if (!Visited.insert(Op).second)
+ continue;
+ Worklist.push_back(cast<Instruction>(Op));
+ }
+ return true;
+}
+
+// Return true if this operand will be safe to use for phi of ops.
+//
+// The reason some operands are unsafe is that we are not trying to recursively
+// translate everything back through phi nodes. We actually expect some lookups
+// of expressions to fail. In particular, a lookup where the expression cannot
+// exist in the predecessor. This is true even if the expression, as shown, can
+// be determined to be constant.
+bool NewGVN::OpIsSafeForPHIOfOps(Value *V, const BasicBlock *PHIBlock,
+ SmallPtrSetImpl<const Value *> &Visited) {
+ SmallVector<Instruction *, 4> Worklist;
+ if (!OpIsSafeForPHIOfOpsHelper(V, PHIBlock, Visited, Worklist))
+ return false;
+ while (!Worklist.empty()) {
+ auto *I = Worklist.pop_back_val();
+ if (!OpIsSafeForPHIOfOpsHelper(I, PHIBlock, Visited, Worklist))
+ return false;
+ }
+ OpSafeForPHIOfOps.insert({V, true});
+ return true;
+}
+
+// Try to find a leader for instruction TransInst, which is a phi-translated
+// version of something in our original program. Visited is used to ensure we
+// don't infinite loop during translations of cycles. OrigInst is the
+// instruction in the original program, and PredBB is the predecessor we
+// translated it through.
+Value *NewGVN::findLeaderForInst(Instruction *TransInst,
+ SmallPtrSetImpl<Value *> &Visited,
+ MemoryAccess *MemAccess, Instruction *OrigInst,
+ BasicBlock *PredBB) {
+ unsigned IDFSNum = InstrToDFSNum(OrigInst);
+ // Make sure it's marked as a temporary instruction.
+ AllTempInstructions.insert(TransInst);
+ // and make sure anything that tries to add its DFS number is
+ // redirected to the instruction we are making a phi of ops
+ // for.
+ TempToBlock.insert({TransInst, PredBB}); + InstrDFS.insert({TransInst, IDFSNum}); + + const Expression *E = performSymbolicEvaluation(TransInst, Visited); + InstrDFS.erase(TransInst); + AllTempInstructions.erase(TransInst); + TempToBlock.erase(TransInst); + if (MemAccess) + TempToMemory.erase(TransInst); + if (!E) + return nullptr; + auto *FoundVal = findPHIOfOpsLeader(E, OrigInst, PredBB); + if (!FoundVal) { + ExpressionToPhiOfOps[E].insert(OrigInst); + DEBUG(dbgs() << "Cannot find phi of ops operand for " << *TransInst + << " in block " << getBlockName(PredBB) << "\n"); + return nullptr; + } + if (auto *SI = dyn_cast<StoreInst>(FoundVal)) + FoundVal = SI->getValueOperand(); + return FoundVal; +} + // When we see an instruction that is an op of phis, generate the equivalent phi // of ops form. const Expression * -NewGVN::makePossiblePhiOfOps(Instruction *I, +NewGVN::makePossiblePHIOfOps(Instruction *I, SmallPtrSetImpl<Value *> &Visited) { if (!okayForPHIOfOps(I)) return nullptr; @@ -2450,7 +2711,6 @@ NewGVN::makePossiblePhiOfOps(Instruction *I, if (!isCycleFree(I)) return nullptr; - unsigned IDFSNum = InstrToDFSNum(I); SmallPtrSet<const Value *, 8> ProcessedPHIs; // TODO: We don't do phi translation on memory accesses because it's // complicated. For a load, we'd need to be able to simulate a new memoryuse, @@ -2463,81 +2723,94 @@ NewGVN::makePossiblePhiOfOps(Instruction *I, MemAccess->getDefiningAccess()->getBlock() == I->getParent()) return nullptr; + SmallPtrSet<const Value *, 10> VisitedOps; // Convert op of phis to phi of ops - for (auto &Op : I->operands()) { - // TODO: We can't handle expressions that must be recursively translated - // IE - // a = phi (b, c) - // f = use a - // g = f + phi of something - // To properly make a phi of ops for g, we'd have to properly translate and - // use the instruction for f. We should add this by splitting out the - // instruction creation we do below. - if (isa<Instruction>(Op) && PHINodeUses.count(cast<Instruction>(Op))) - return nullptr; - if (!isa<PHINode>(Op)) - continue; + for (auto *Op : I->operand_values()) { + if (!isa<PHINode>(Op)) { + auto *ValuePHI = RealToTemp.lookup(Op); + if (!ValuePHI) + continue; + DEBUG(dbgs() << "Found possible dependent phi of ops\n"); + Op = ValuePHI; + } auto *OpPHI = cast<PHINode>(Op); // No point in doing this for one-operand phis. if (OpPHI->getNumOperands() == 1) continue; if (!DebugCounter::shouldExecute(PHIOfOpsCounter)) return nullptr; - SmallVector<std::pair<Value *, BasicBlock *>, 4> Ops; + SmallVector<ValPair, 4> Ops; + SmallPtrSet<Value *, 4> Deps; auto *PHIBlock = getBlockForValue(OpPHI); - for (auto PredBB : OpPHI->blocks()) { + RevisitOnReachabilityChange[PHIBlock].reset(InstrToDFSNum(I)); + for (unsigned PredNum = 0; PredNum < OpPHI->getNumOperands(); ++PredNum) { + auto *PredBB = OpPHI->getIncomingBlock(PredNum); Value *FoundVal = nullptr; // We could just skip unreachable edges entirely but it's tricky to do // with rewriting existing phi nodes. if (ReachableEdges.count({PredBB, PHIBlock})) { - // Clone the instruction, create an expression from it, and see if we - // have a leader. + // Clone the instruction, create an expression from it that is + // translated back into the predecessor, and see if we have a leader. 
Instruction *ValueOp = I->clone();
if (MemAccess)
TempToMemory.insert({ValueOp, MemAccess});
-
+ bool SafeForPHIOfOps = true;
+ VisitedOps.clear();
for (auto &Op : ValueOp->operands()) {
- Op = Op->DoPHITranslation(PHIBlock, PredBB);
- // When this operand changes, it could change whether there is a
- // leader for us or not.
- addAdditionalUsers(Op, I);
+ auto *OrigOp = &*Op;
+ // When these operands change, it could change whether there is a
+ // leader for us or not, so we have to add additional users.
+ if (isa<PHINode>(Op)) {
+ Op = Op->DoPHITranslation(PHIBlock, PredBB);
+ if (Op != OrigOp && Op != I)
+ Deps.insert(Op);
+ } else if (auto *ValuePHI = RealToTemp.lookup(Op)) {
+ if (getBlockForValue(ValuePHI) == PHIBlock)
+ Op = ValuePHI->getIncomingValueForBlock(PredBB);
+ }
+ // If we phi-translated the op, it must be safe.
+ SafeForPHIOfOps =
+ SafeForPHIOfOps &&
+ (Op != OrigOp || OpIsSafeForPHIOfOps(Op, PHIBlock, VisitedOps));
}
- // Make sure it's marked as a temporary instruction.
- AllTempInstructions.insert(ValueOp);
- // and make sure anything that tries to add it's DFS number is
- // redirected to the instruction we are making a phi of ops
- // for.
- InstrDFS.insert({ValueOp, IDFSNum});
- const Expression *E = performSymbolicEvaluation(ValueOp, Visited);
- InstrDFS.erase(ValueOp);
- AllTempInstructions.erase(ValueOp);
+ // FIXME: For those things that are not safe we could generate
+ // expressions all the way down, and see if this comes out to a
+ // constant. For anything where that is true, and unsafe, we should
+ // have made a phi-of-ops (or value numbered it equivalent to something)
+ // for the pieces already.
+ FoundVal = !SafeForPHIOfOps ? nullptr
+ : findLeaderForInst(ValueOp, Visited,
+ MemAccess, I, PredBB);
ValueOp->deleteValue();
- if (MemAccess)
- TempToMemory.erase(ValueOp);
- if (!E)
+ if (!FoundVal)
return nullptr;
- FoundVal = findPhiOfOpsLeader(E, PredBB);
- if (!FoundVal) {
- ExpressionToPhiOfOps[E].insert(I);
- return nullptr;
- }
- if (auto *SI = dyn_cast<StoreInst>(FoundVal))
- FoundVal = SI->getValueOperand();
} else {
DEBUG(dbgs() << "Skipping phi of ops operand for incoming block "
<< getBlockName(PredBB)
<< " because the block is unreachable\n");
FoundVal = UndefValue::get(I->getType());
+ RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
}
Ops.push_back({FoundVal, PredBB});
DEBUG(dbgs() << "Found phi of ops operand " << *FoundVal << " in "
<< getBlockName(PredBB) << "\n");
}
+ for (auto Dep : Deps)
+ addAdditionalUsers(Dep, I);
+ sortPHIOps(Ops);
+ auto *E = performSymbolicPHIEvaluation(Ops, I, PHIBlock);
+ if (isa<ConstantExpression>(E) || isa<VariableExpression>(E)) {
+ DEBUG(dbgs()
+ << "Not creating real PHI of ops because it simplified to existing "
+ "value or constant\n");
+ return E;
+ }
auto *ValuePHI = RealToTemp.lookup(I);
bool NewPHI = false;
if (!ValuePHI) {
- ValuePHI = PHINode::Create(I->getType(), OpPHI->getNumOperands());
+ ValuePHI =
+ PHINode::Create(I->getType(), OpPHI->getNumOperands(), "phiofops");
addPhiOfOps(ValuePHI, PHIBlock, I);
NewPHI = true;
NumGVNPHIOfOpsCreated++;
@@ -2553,10 +2826,11 @@ NewGVN::makePossiblePhiOfOps(Instruction *I,
++i;
}
}
-
+ RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
DEBUG(dbgs() << "Created phi of ops " << *ValuePHI << " for " << *I << "\n");
- return performSymbolicEvaluation(ValuePHI, Visited);
+
+ return E;
}
return nullptr;
}
@@ -2602,8 +2876,11 @@ void NewGVN::initializeCongruenceClasses(Function &F) {
if (MD && isa<StoreInst>(MD->getMemoryInst()))
TOPClass->incStoreCount(); } + + // FIXME: This is trying to discover which instructions are uses of phi + // nodes. We should move this into one of the myriad of places that walk + // all the operands already. for (auto &I : *BB) { - // TODO: Move to helper if (isa<PHINode>(&I)) for (auto *U : I.users()) if (auto *UInst = dyn_cast<Instruction>(U)) @@ -2661,7 +2938,8 @@ void NewGVN::cleanupTables() { ExpressionToPhiOfOps.clear(); TempToBlock.clear(); TempToMemory.clear(); - PHIOfOpsPHIs.clear(); + PHINodeUses.clear(); + OpSafeForPHIOfOps.clear(); ReachableBlocks.clear(); ReachableEdges.clear(); #ifndef NDEBUG @@ -2675,6 +2953,7 @@ void NewGVN::cleanupTables() { MemoryAccessToClass.clear(); PredicateToUsers.clear(); MemoryToUsers.clear(); + RevisitOnReachabilityChange.clear(); } // Assign local DFS number mapping to instructions, and leave space for Value @@ -2698,6 +2977,8 @@ std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B, markInstructionForDeletion(&I); continue; } + if (isa<PHINode>(&I)) + RevisitOnReachabilityChange[B].set(End); InstrDFS[&I] = End++; DFSToInstr.emplace_back(&I); } @@ -2719,6 +3000,7 @@ void NewGVN::updateProcessedCount(const Value *V) { } #endif } + // Evaluate MemoryPhi nodes symbolically, just like PHI nodes void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) { // If all the arguments are the same, the MemoryPhi has the same value as the @@ -2787,11 +3069,15 @@ void NewGVN::valueNumberInstruction(Instruction *I) { // Make a phi of ops if necessary if (Symbolized && !isa<ConstantExpression>(Symbolized) && !isa<VariableExpression>(Symbolized) && PHINodeUses.count(I)) { - auto *PHIE = makePossiblePhiOfOps(I, Visited); - if (PHIE) + auto *PHIE = makePossiblePHIOfOps(I, Visited); + // If we created a phi of ops, use it. + // If we couldn't create one, make sure we don't leave one lying around + if (PHIE) { Symbolized = PHIE; + } else if (auto *Op = RealToTemp.lookup(I)) { + removePhiOfOps(I, Op); + } } - } else { // Mark the instruction as unused so we don't value number it again. InstrDFS[I] = 0; @@ -2905,7 +3191,7 @@ void NewGVN::verifyMemoryCongruency() const { // so we don't process them. if (auto *MemPHI = dyn_cast<MemoryPhi>(Pair.first)) { for (auto &U : MemPHI->incoming_values()) { - if (Instruction *I = dyn_cast<Instruction>(U.get())) { + if (auto *I = dyn_cast<Instruction>(&*U)) { if (!isInstructionTriviallyDead(I)) return true; } @@ -3200,11 +3486,13 @@ struct NewGVN::ValueDFS { int DFSIn = 0; int DFSOut = 0; int LocalNum = 0; + // Only one of Def and U will be set. // The bool in the Def tells us whether the Def is the stored value of a // store. PointerIntPair<Value *, 1, bool> Def; Use *U = nullptr; + bool operator<(const ValueDFS &Other) const { // It's not enough that any given field be less than - we have sets // of fields that need to be evaluated together to give a proper ordering. 
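// --- Editorial sketch (annotation, not part of the patch) -------------------
// The DFSIn/DFSOut fields above encode dominator-tree DFS intervals: block A
// dominates block B exactly when A's interval encloses B's. This is also what
// ValueDFSStack::isInScope (next hunk) checks before reusing a leader during
// elimination. A simplified, hypothetical illustration:
struct DFSIntervalSketch {
  int In, Out;
};
// True if Dominator's interval encloses Candidate's, i.e. every path to
// Candidate passes through Dominator.
bool enclosesSketch(DFSIntervalSketch Dominator, DFSIntervalSketch Candidate) {
  return Dominator.In <= Candidate.In && Dominator.Out >= Candidate.Out;
}
// --- End sketch --------------------------------------------------------------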
@@ -3439,7 +3727,6 @@ void NewGVN::markInstructionForDeletion(Instruction *I) { } void NewGVN::replaceInstruction(Instruction *I, Value *V) { - DEBUG(dbgs() << "Replacing " << *I << " with " << *V << "\n"); patchAndReplaceAllUsesWith(I, V); // We save the actual erasing to avoid invalidating memory @@ -3460,7 +3747,9 @@ public: ValueStack.emplace_back(V); DFSStack.emplace_back(DFSIn, DFSOut); } + bool empty() const { return DFSStack.empty(); } + bool isInScope(int DFSIn, int DFSOut) const { if (empty()) return false; @@ -3484,19 +3773,33 @@ private: SmallVector<Value *, 8> ValueStack; SmallVector<std::pair<int, int>, 8> DFSStack; }; + +} // end anonymous namespace + +// Given an expression, get the congruence class for it. +CongruenceClass *NewGVN::getClassForExpression(const Expression *E) const { + if (auto *VE = dyn_cast<VariableExpression>(E)) + return ValueToClass.lookup(VE->getVariableValue()); + else if (isa<DeadExpression>(E)) + return TOPClass; + return ExpressionToClass.lookup(E); } // Given a value and a basic block we are trying to see if it is available in, // see if the value has a leader available in that block. -Value *NewGVN::findPhiOfOpsLeader(const Expression *E, +Value *NewGVN::findPHIOfOpsLeader(const Expression *E, + const Instruction *OrigInst, const BasicBlock *BB) const { // It would already be constant if we could make it constant if (auto *CE = dyn_cast<ConstantExpression>(E)) return CE->getConstantValue(); - if (auto *VE = dyn_cast<VariableExpression>(E)) - return VE->getVariableValue(); + if (auto *VE = dyn_cast<VariableExpression>(E)) { + auto *V = VE->getVariableValue(); + if (alwaysAvailable(V) || DT->dominates(getBlockForValue(V), BB)) + return VE->getVariableValue(); + } - auto *CC = ExpressionToClass.lookup(E); + auto *CC = getClassForExpression(E); if (!CC) return nullptr; if (alwaysAvailable(CC->getLeader())) @@ -3504,15 +3807,13 @@ Value *NewGVN::findPhiOfOpsLeader(const Expression *E, for (auto Member : *CC) { auto *MemberInst = dyn_cast<Instruction>(Member); + if (MemberInst == OrigInst) + continue; // Anything that isn't an instruction is always available. if (!MemberInst) return Member; - // If we are looking for something in the same block as the member, it must - // be a leader because this function is looking for operands for a phi node. - if (MemberInst->getParent() == BB || - DT->dominates(MemberInst->getParent(), BB)) { + if (DT->dominates(getBlockForValue(MemberInst), BB)) return Member; - } } return nullptr; } @@ -3549,36 +3850,39 @@ bool NewGVN::eliminateInstructions(Function &F) { // Go through all of our phi nodes, and kill the arguments associated with // unreachable edges. 
- auto ReplaceUnreachablePHIArgs = [&](PHINode &PHI, BasicBlock *BB) { - for (auto &Operand : PHI.incoming_values()) - if (!ReachableEdges.count({PHI.getIncomingBlock(Operand), BB})) { + auto ReplaceUnreachablePHIArgs = [&](PHINode *PHI, BasicBlock *BB) { + for (auto &Operand : PHI->incoming_values()) + if (!ReachableEdges.count({PHI->getIncomingBlock(Operand), BB})) { DEBUG(dbgs() << "Replacing incoming value of " << PHI << " for block " - << getBlockName(PHI.getIncomingBlock(Operand)) + << getBlockName(PHI->getIncomingBlock(Operand)) << " with undef due to it being unreachable\n"); - Operand.set(UndefValue::get(PHI.getType())); + Operand.set(UndefValue::get(PHI->getType())); } }; - SmallPtrSet<BasicBlock *, 8> BlocksWithPhis; - for (auto &B : F) - if ((!B.empty() && isa<PHINode>(*B.begin())) || - (PHIOfOpsPHIs.find(&B) != PHIOfOpsPHIs.end())) - BlocksWithPhis.insert(&B); + // Replace unreachable phi arguments. + // At this point, RevisitOnReachabilityChange only contains: + // + // 1. PHIs + // 2. Temporaries that will convert to PHIs + // 3. Operations that are affected by an unreachable edge but do not fit into + // 1 or 2 (rare). + // So it is a slight overshoot of what we want. We could make it exact by + // using two SparseBitVectors per block. DenseMap<const BasicBlock *, unsigned> ReachablePredCount; - for (auto KV : ReachableEdges) + for (auto &KV : ReachableEdges) ReachablePredCount[KV.getEnd()]++; - for (auto *BB : BlocksWithPhis) - // TODO: It would be faster to use getNumIncomingBlocks() on a phi node in - // the block and subtract the pred count, but it's more complicated. - if (ReachablePredCount.lookup(BB) != - unsigned(std::distance(pred_begin(BB), pred_end(BB)))) { - for (auto II = BB->begin(); isa<PHINode>(II); ++II) { - auto &PHI = cast<PHINode>(*II); + for (auto &BBPair : RevisitOnReachabilityChange) { + for (auto InstNum : BBPair.second) { + auto *Inst = InstrFromDFSNum(InstNum); + auto *PHI = dyn_cast<PHINode>(Inst); + PHI = PHI ? PHI : dyn_cast_or_null<PHINode>(RealToTemp.lookup(Inst)); + if (!PHI) + continue; + auto *BB = BBPair.first; + if (ReachablePredCount.lookup(BB) != PHI->getNumIncomingValues()) ReplaceUnreachablePHIArgs(PHI, BB); - } - for_each_found(PHIOfOpsPHIs, BB, [&](PHINode *PHI) { - ReplaceUnreachablePHIArgs(*PHI, BB); - }); } + } // Map to store the use counts DenseMap<const Value *, unsigned int> UseCounts; @@ -3631,7 +3935,7 @@ bool NewGVN::eliminateInstructions(Function &F) { CC->swap(MembersLeft); } else { // If this is a singleton, we can skip it. - if (CC->size() != 1 || RealToTemp.lookup(Leader)) { + if (CC->size() != 1 || RealToTemp.count(Leader)) { // This is a stack because equality replacement/etc may place // constants in the middle of the member list, and we want to use // those constant values in preference to the current leader, over @@ -3873,12 +4177,16 @@ bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const { } namespace { + class NewGVNLegacyPass : public FunctionPass { public: - static char ID; // Pass identification, replacement for typeid. + // Pass identification, replacement for typeid. 
+ static char ID; + NewGVNLegacyPass() : FunctionPass(ID) { initializeNewGVNLegacyPassPass(*PassRegistry::getPassRegistry()); } + bool runOnFunction(Function &F) override; private: @@ -3892,7 +4200,8 @@ private: AU.addPreserved<GlobalsAAWrapperPass>(); } }; -} // namespace + +} // end anonymous namespace bool NewGVNLegacyPass::runOnFunction(Function &F) { if (skipFunction(F)) @@ -3906,6 +4215,8 @@ bool NewGVNLegacyPass::runOnFunction(Function &F) { .runGVN(); } +char NewGVNLegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(NewGVNLegacyPass, "newgvn", "Global Value Numbering", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) @@ -3917,8 +4228,6 @@ INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(NewGVNLegacyPass, "newgvn", "Global Value Numbering", false, false) -char NewGVNLegacyPass::ID = 0; - // createGVNPass - The public interface to this file. FunctionPass *llvm::createNewGVNPass() { return new NewGVNLegacyPass(); } diff --git a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index 1bfecea2f61e..1748815c5941 100644 --- a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -26,16 +26,13 @@ using namespace llvm; static bool optimizeSQRT(CallInst *Call, Function *CalledFunc, - BasicBlock &CurrBB, Function::iterator &BB) { + BasicBlock &CurrBB, Function::iterator &BB, + const TargetTransformInfo *TTI) { // There is no need to change the IR, since backend will emit sqrt // instruction if the call has already been marked read-only. if (Call->onlyReadsMemory()) return false; - // The call must have the expected result type. - if (!Call->getType()->isFloatingPointTy()) - return false; - // Do the following transformation: // // (before) @@ -43,7 +40,7 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc, // // (after) // v0 = sqrt_noreadmem(src) # native sqrt instruction. - // if (v0 is a NaN) + // [if (v0 is a NaN) || if (src < 0)] // v1 = sqrt(src) # library call. // dst = phi(v0, v1) // @@ -52,7 +49,8 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc, // Create phi and replace all uses. BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode()); IRBuilder<> Builder(JoinBB, JoinBB->begin()); - PHINode *Phi = Builder.CreatePHI(Call->getType(), 2); + Type *Ty = Call->getType(); + PHINode *Phi = Builder.CreatePHI(Ty, 2); Call->replaceAllUsesWith(Phi); // Create basic block LibCallBB and insert a call to library function sqrt. @@ -69,7 +67,10 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc, Call->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone); CurrBB.getTerminator()->eraseFromParent(); Builder.SetInsertPoint(&CurrBB); - Value *FCmp = Builder.CreateFCmpOEQ(Call, Call); + Value *FCmp = TTI->isFCmpOrdCheaperThanFCmpZero(Ty) + ? Builder.CreateFCmpORD(Call, Call) + : Builder.CreateFCmpOGE(Call->getOperand(0), + ConstantFP::get(Ty, 0.0)); Builder.CreateCondBr(FCmp, JoinBB, LibCallBB); // Add phi operands. @@ -96,18 +97,21 @@ static bool runPartiallyInlineLibCalls(Function &F, TargetLibraryInfo *TLI, if (!Call || !(CalledFunc = Call->getCalledFunction())) continue; + if (Call->isNoBuiltin()) + continue; + // Skip if function either has local linkage or is not a known library // function. 
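// --- Editorial sketch (annotation, not part of the patch) -------------------
// The control flow optimizeSQRT emits, written as straight-line C-style
// pseudocode. hw_sqrt is a hypothetical stand-in for the native, errno-free
// sqrt instruction; which guard is emitted depends on
// TTI->isFCmpOrdCheaperThanFCmpZero:
//
//   double partially_inlined_sqrt(double src) {
//     double v0 = hw_sqrt(src);      // fast path, marked ReadNone
//     // Guard A: ordered self-compare, true unless v0 is NaN.
//     // Guard B: src >= 0.0, true whenever the fast result is valid.
//     if (!(v0 == v0))               // or: if (!(src >= 0.0))
//       v0 = sqrt(src);              // slow path: libm call sets errno
//     return v0;                     // the phi of the two values
//   }
// --- End sketch --------------------------------------------------------------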
LibFunc LF; - if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() || - !TLI->getLibFunc(CalledFunc->getName(), LF)) + if (CalledFunc->hasLocalLinkage() || + !TLI->getLibFunc(*CalledFunc, LF) || !TLI->has(LF)) continue; switch (LF) { case LibFunc_sqrtf: case LibFunc_sqrt: if (TTI->haveFastSqrt(Call->getType()) && - optimizeSQRT(Call, CalledFunc, *CurrBB, BB)) + optimizeSQRT(Call, CalledFunc, *CurrBB, BB, TTI)) break; continue; default: diff --git a/lib/Transforms/Scalar/PlaceSafepoints.cpp b/lib/Transforms/Scalar/PlaceSafepoints.cpp index e47b636348e3..2d0cb6fbf211 100644 --- a/lib/Transforms/Scalar/PlaceSafepoints.cpp +++ b/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -54,6 +54,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" @@ -113,6 +114,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass { ScalarEvolution *SE = nullptr; DominatorTree *DT = nullptr; LoopInfo *LI = nullptr; + TargetLibraryInfo *TLI = nullptr; PlaceBackedgeSafepointsImpl(bool CallSafepoints = false) : FunctionPass(ID), CallSafepointsEnabled(CallSafepoints) { @@ -131,6 +133,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass { SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); for (Loop *I : *LI) { runOnLoopAndSubLoops(I); } @@ -141,6 +144,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass { AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); // We no longer modify the IR at all in this pass. Thus all // analysis are preserved. AU.setPreservesAll(); @@ -165,6 +169,7 @@ struct PlaceSafepoints : public FunctionPass { // We modify the graph wholesale (inlining, block insertion, etc). We // preserve nothing at the moment. We could potentially preserve dom tree // if that was worth doing + AU.addRequired<TargetLibraryInfoWrapperPass>(); } }; } @@ -174,10 +179,11 @@ struct PlaceSafepoints : public FunctionPass { // callers job. static void InsertSafepointPoll(Instruction *InsertBefore, - std::vector<CallSite> &ParsePointsNeeded /*rval*/); + std::vector<CallSite> &ParsePointsNeeded /*rval*/, + const TargetLibraryInfo &TLI); -static bool needsStatepoint(const CallSite &CS) { - if (callsGCLeafFunction(CS)) +static bool needsStatepoint(const CallSite &CS, const TargetLibraryInfo &TLI) { + if (callsGCLeafFunction(CS, TLI)) return false; if (CS.isCall()) { CallInst *call = cast<CallInst>(CS.getInstruction()); @@ -194,7 +200,8 @@ static bool needsStatepoint(const CallSite &CS) { /// answer; i.e. false is always valid. static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header, BasicBlock *Pred, - DominatorTree &DT) { + DominatorTree &DT, + const TargetLibraryInfo &TLI) { // In general, we're looking for any cut of the graph which ensures // there's a call safepoint along every edge between Header and Pred. // For the moment, we look only for the 'cuts' that consist of a single call @@ -217,7 +224,7 @@ static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header, // unconditional poll. 
In practice, this is only a theoretical concern // since we don't have any methods with conditional-only safepoint // polls. - if (needsStatepoint(CS)) + if (needsStatepoint(CS, TLI)) return true; } @@ -321,7 +328,7 @@ bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) { continue; } if (CallSafepointsEnabled && - containsUnconditionalCallSafepoint(L, Header, Pred, *DT)) { + containsUnconditionalCallSafepoint(L, Header, Pred, *DT, *TLI)) { // Note: This is only semantically legal since we won't do any further // IPO or inlining before the actual call insertion.. If we hadn't, we // might latter loose this call safepoint. @@ -472,6 +479,9 @@ bool PlaceSafepoints::runOnFunction(Function &F) { if (!shouldRewriteFunction(F)) return false; + const TargetLibraryInfo &TLI = + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + bool Modified = false; // In various bits below, we rely on the fact that uses are reachable from @@ -578,7 +588,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) { // safepoint polls themselves. for (Instruction *PollLocation : PollsNeeded) { std::vector<CallSite> RuntimeCalls; - InsertSafepointPoll(PollLocation, RuntimeCalls); + InsertSafepointPoll(PollLocation, RuntimeCalls, TLI); ParsePointNeeded.insert(ParsePointNeeded.end(), RuntimeCalls.begin(), RuntimeCalls.end()); } @@ -610,7 +620,8 @@ INITIALIZE_PASS_END(PlaceSafepoints, "place-safepoints", "Place Safepoints", static void InsertSafepointPoll(Instruction *InsertBefore, - std::vector<CallSite> &ParsePointsNeeded /*rval*/) { + std::vector<CallSite> &ParsePointsNeeded /*rval*/, + const TargetLibraryInfo &TLI) { BasicBlock *OrigBB = InsertBefore->getParent(); Module *M = InsertBefore->getModule(); assert(M && "must be part of a module"); @@ -669,7 +680,7 @@ InsertSafepointPoll(Instruction *InsertBefore, assert(ParsePointsNeeded.empty()); for (auto *CI : Calls) { // No safepoint needed or wanted - if (!needsStatepoint(CI)) + if (!needsStatepoint(CI, TLI)) continue; // These are likely runtime calls. 
Should we assert that via calling diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index e235e5eb1a06..88dcaf0f8a36 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -21,28 +21,45 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/Reassociate.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> +#include <cassert> +#include <utility> + using namespace llvm; using namespace reassociate; @@ -54,7 +71,6 @@ STATISTIC(NumFactor , "Number of multiplies factored"); #ifndef NDEBUG /// Print out the expression identified in the Ops list. -/// static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) { Module *M = I->getModule(); dbgs() << Instruction::getOpcodeName(I->getOpcode()) << " " @@ -128,38 +144,37 @@ XorOpnd::XorOpnd(Value *V) { /// Return true if V is an instruction of the specified opcode and if it /// only has one use. static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { - if (V->hasOneUse() && isa<Instruction>(V) && - cast<Instruction>(V)->getOpcode() == Opcode && - (!isa<FPMathOperator>(V) || - cast<Instruction>(V)->hasUnsafeAlgebra())) - return cast<BinaryOperator>(V); + auto *I = dyn_cast<Instruction>(V); + if (I && I->hasOneUse() && I->getOpcode() == Opcode) + if (!isa<FPMathOperator>(I) || I->isFast()) + return cast<BinaryOperator>(I); return nullptr; } static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1, unsigned Opcode2) { - if (V->hasOneUse() && isa<Instruction>(V) && - (cast<Instruction>(V)->getOpcode() == Opcode1 || - cast<Instruction>(V)->getOpcode() == Opcode2) && - (!isa<FPMathOperator>(V) || - cast<Instruction>(V)->hasUnsafeAlgebra())) - return cast<BinaryOperator>(V); + auto *I = dyn_cast<Instruction>(V); + if (I && I->hasOneUse() && + (I->getOpcode() == Opcode1 || I->getOpcode() == Opcode2)) + if (!isa<FPMathOperator>(I) || I->isFast()) + return cast<BinaryOperator>(I); return nullptr; } void ReassociatePass::BuildRankMap(Function &F, ReversePostOrderTraversal<Function*> &RPOT) { - unsigned i = 2; + unsigned Rank = 2; // Assign distinct ranks to function arguments. 
- for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) { - ValueRankMap[&*I] = ++i; - DEBUG(dbgs() << "Calculated Rank[" << I->getName() << "] = " << i << "\n"); + for (auto &Arg : F.args()) { + ValueRankMap[&Arg] = ++Rank; + DEBUG(dbgs() << "Calculated Rank[" << Arg.getName() << "] = " << Rank + << "\n"); } // Traverse basic blocks in ReversePostOrder for (BasicBlock *BB : RPOT) { - unsigned BBRank = RankMap[BB] = ++i << 16; + unsigned BBRank = RankMap[BB] = ++Rank << 16; // Walk the basic block, adding precomputed ranks for any instructions that // we cannot move. This ensures that the ranks for these instructions are @@ -207,13 +222,9 @@ void ReassociatePass::canonicalizeOperands(Instruction *I) { Value *LHS = I->getOperand(0); Value *RHS = I->getOperand(1); - unsigned LHSRank = getRank(LHS); - unsigned RHSRank = getRank(RHS); - - if (isa<Constant>(RHS)) + if (LHS == RHS || isa<Constant>(RHS)) return; - - if (isa<Constant>(LHS) || RHSRank < LHSRank) + if (isa<Constant>(LHS) || getRank(RHS) < getRank(LHS)) cast<BinaryOperator>(I)->swapOperands(); } @@ -357,7 +368,7 @@ static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) { } } -typedef std::pair<Value*, APInt> RepeatedValue; +using RepeatedValue = std::pair<Value*, APInt>; /// Given an associative binary expression, return the leaf /// nodes in Ops along with their weights (how many times the leaf occurs). The @@ -432,7 +443,6 @@ typedef std::pair<Value*, APInt> RepeatedValue; /// that have all uses inside the expression (i.e. only used by non-leaf nodes /// of the expression) if it can turn them into binary operators of the right /// type and thus make the expression bigger. - static bool LinearizeExprTree(BinaryOperator *I, SmallVectorImpl<RepeatedValue> &Ops) { DEBUG(dbgs() << "LINEARIZE: " << *I << '\n'); @@ -470,12 +480,12 @@ static bool LinearizeExprTree(BinaryOperator *I, // Leaves - Keeps track of the set of putative leaves as well as the number of // paths to each leaf seen so far. - typedef DenseMap<Value*, APInt> LeafMap; + using LeafMap = DenseMap<Value *, APInt>; LeafMap Leaves; // Leaf -> Total weight so far. - SmallVector<Value*, 8> LeafOrder; // Ensure deterministic leaf output order. + SmallVector<Value *, 8> LeafOrder; // Ensure deterministic leaf output order. #ifndef NDEBUG - SmallPtrSet<Value*, 8> Visited; // For sanity checking the iteration scheme. + SmallPtrSet<Value *, 8> Visited; // For sanity checking the iteration scheme. #endif while (!Worklist.empty()) { std::pair<BinaryOperator*, APInt> P = Worklist.pop_back_val(); @@ -554,7 +564,7 @@ static bool LinearizeExprTree(BinaryOperator *I, assert((!isa<Instruction>(Op) || cast<Instruction>(Op)->getOpcode() != Opcode || (isa<FPMathOperator>(Op) && - !cast<Instruction>(Op)->hasUnsafeAlgebra())) && + !cast<Instruction>(Op)->isFast())) && "Should have been handled above!"); assert(Op->hasOneUse() && "Has uses outside the expression tree!"); @@ -773,7 +783,7 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I, break; ExpressionChanged->moveBefore(I); ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->user_begin()); - } while (1); + } while (true); // Throw away any left over nodes from the original expression. for (unsigned i = 0, e = NodesToRewrite.size(); i != e; ++i) @@ -789,13 +799,9 @@ void ReassociatePass::RewriteExprTree(BinaryOperator *I, /// additional opportunities have been exposed. 
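// --- Editorial sketch (annotation, not part of the patch) -------------------
// The rank scheme from BuildRankMap/canonicalizeOperands above, in simplified
// terms: constants effectively rank lowest, arguments get small distinct
// ranks, and instructions inherit block ranks that grow in RPO (BBRank is the
// block counter shifted left by 16). canonicalizeOperands then swaps a
// commutative op's operands so the lower-ranked value sits on the right:
//
//   %t = add i32 5, %x   ; constant on the left
//   ; canonicalizes to: %t = add i32 %x, 5
//
// so later matching can treat "x + 5" and "5 + x" identically.
// --- End sketch --------------------------------------------------------------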
static Value *NegateValue(Value *V, Instruction *BI, SetVector<AssertingVH<Instruction>> &ToRedo) { - if (Constant *C = dyn_cast<Constant>(V)) { - if (C->getType()->isFPOrFPVectorTy()) { - return ConstantExpr::getFNeg(C); - } - return ConstantExpr::getNeg(C); - } - + if (auto *C = dyn_cast<Constant>(V)) + return C->getType()->isFPOrFPVectorTy() ? ConstantExpr::getFNeg(C) : + ConstantExpr::getNeg(C); // We are trying to expose opportunity for reassociation. One of the things // that we want to do to achieve this is to push a negation as deep into an @@ -913,7 +919,6 @@ BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) { // // Calculate the negative value of Operand 1 of the sub instruction, // and set it as the RHS of the add instruction we just made. - // Value *NegVal = NegateValue(Sub->getOperand(1), Sub, ToRedo); BinaryOperator *New = CreateAdd(Sub->getOperand(0), NegVal, "", Sub, Sub); Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op. @@ -990,7 +995,7 @@ static Value *EmitAddTreeOfValues(Instruction *I, Value *V1 = Ops.back(); Ops.pop_back(); Value *V2 = EmitAddTreeOfValues(I, Ops); - return CreateAdd(V2, V1, "tmp", I, I); + return CreateAdd(V2, V1, "reass.add", I, I); } /// If V is an expression tree that is a multiplication sequence, @@ -1157,7 +1162,6 @@ static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd, // If it was successful, true is returned, and the "R" and "C" is returned // via "Res" and "ConstOpnd", respectively; otherwise, false is returned, // and both "Res" and "ConstOpnd" remain unchanged. -// bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, APInt &ConstOpnd, Value *&Res) { // Xor-Rule 1: (x | c1) ^ c2 = (x | c1) ^ (c1 ^ c1) ^ c2 @@ -1183,7 +1187,6 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, RedoInsts.insert(T); return true; } - // Helper function of OptimizeXor(). It tries to simplify // "Opnd1 ^ Opnd2 ^ ConstOpnd" into "R ^ C", where C would be 0, and R is a @@ -1230,7 +1233,6 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, Res = createAndInstr(I, X, C3); ConstOpnd ^= C1; - } else if (Opnd1->isOrExpr()) { // Xor-Rule 3: (x | c1) ^ (x | c2) = (x & c3) ^ c3 where c3 = c1 ^ c2 // @@ -1349,7 +1351,6 @@ Value *ReassociatePass::OptimizeXor(Instruction *I, // step 3.2: When previous and current operands share the same symbolic // value, try to simplify "PrevOpnd ^ CurrOpnd ^ ConstOpnd" - // if (CombineXorOpnd(I, CurrOpnd, PrevOpnd, ConstOpnd, CV)) { // Remove previous operand PrevOpnd->Invalidate(); @@ -1601,7 +1602,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I, RedoInsts.insert(VI); // Create the multiply. - Instruction *V2 = CreateMul(V, MaxOccVal, "tmp", I, I); + Instruction *V2 = CreateMul(V, MaxOccVal, "reass.mul", I, I); // Rerun associate on the multiply in case the inner expression turned into // a multiply. We want to make sure that we keep things in canonical form. @@ -2012,8 +2013,8 @@ void ReassociatePass::OptimizeInst(Instruction *I) { if (I->isCommutative()) canonicalizeOperands(I); - // Don't optimize floating point instructions that don't have unsafe algebra. - if (I->getType()->isFPOrFPVectorTy() && !I->hasUnsafeAlgebra()) + // Don't optimize floating-point instructions unless they are 'fast'. + if (I->getType()->isFPOrFPVectorTy() && !I->isFast()) return; // Do not reassociate boolean (i1) expressions. 
We want to preserve the @@ -2140,7 +2141,8 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) { DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n'); I->replaceAllUsesWith(V); if (Instruction *VI = dyn_cast<Instruction>(V)) - VI->setDebugLoc(I->getDebugLoc()); + if (I->getDebugLoc()) + VI->setDebugLoc(I->getDebugLoc()); RedoInsts.insert(I); ++NumAnnihil; return; @@ -2183,11 +2185,104 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) { return; } + if (Ops.size() > 2 && Ops.size() <= GlobalReassociateLimit) { + // Find the pair with the highest count in the pairmap and move it to the + // back of the list so that it can later be CSE'd. + // example: + // a*b*c*d*e + // if c*e is the most "popular" pair, we can express this as + // (((c*e)*d)*b)*a + unsigned Max = 1; + unsigned BestRank = 0; + std::pair<unsigned, unsigned> BestPair; + unsigned Idx = I->getOpcode() - Instruction::BinaryOpsBegin; + for (unsigned i = 0; i < Ops.size() - 1; ++i) + for (unsigned j = i + 1; j < Ops.size(); ++j) { + unsigned Score = 0; + Value *Op0 = Ops[i].Op; + Value *Op1 = Ops[j].Op; + if (std::less<Value *>()(Op1, Op0)) + std::swap(Op0, Op1); + auto it = PairMap[Idx].find({Op0, Op1}); + if (it != PairMap[Idx].end()) + Score += it->second; + + unsigned MaxRank = std::max(Ops[i].Rank, Ops[j].Rank); + if (Score > Max || (Score == Max && MaxRank < BestRank)) { + BestPair = {i, j}; + Max = Score; + BestRank = MaxRank; + } + } + if (Max > 1) { + auto Op0 = Ops[BestPair.first]; + auto Op1 = Ops[BestPair.second]; + Ops.erase(&Ops[BestPair.second]); + Ops.erase(&Ops[BestPair.first]); + Ops.push_back(Op0); + Ops.push_back(Op1); + } + } // Now that we ordered and optimized the expressions, splat them back into // the expression tree, removing any unneeded nodes. RewriteExprTree(I, Ops); } +void +ReassociatePass::BuildPairMap(ReversePostOrderTraversal<Function *> &RPOT) { + // Make a "pairmap" of how often each operand pair occurs. + for (BasicBlock *BI : RPOT) { + for (Instruction &I : *BI) { + if (!I.isAssociative()) + continue; + + // Ignore nodes that aren't at the root of trees. + if (I.hasOneUse() && I.user_back()->getOpcode() == I.getOpcode()) + continue; + + // Collect all operands in a single reassociable expression. + // Since Reassociate has already been run once, we can assume things + // are already canonical according to Reassociation's regime. + SmallVector<Value *, 8> Worklist = { I.getOperand(0), I.getOperand(1) }; + SmallVector<Value *, 8> Ops; + while (!Worklist.empty() && Ops.size() <= GlobalReassociateLimit) { + Value *Op = Worklist.pop_back_val(); + Instruction *OpI = dyn_cast<Instruction>(Op); + if (!OpI || OpI->getOpcode() != I.getOpcode() || !OpI->hasOneUse()) { + Ops.push_back(Op); + continue; + } + // Be paranoid about self-referencing expressions in unreachable code. + if (OpI->getOperand(0) != OpI) + Worklist.push_back(OpI->getOperand(0)); + if (OpI->getOperand(1) != OpI) + Worklist.push_back(OpI->getOperand(1)); + } + // Skip extremely long expressions. + if (Ops.size() > GlobalReassociateLimit) + continue; + + // Add all pairwise combinations of operands to the pair map. + unsigned BinaryIdx = I.getOpcode() - Instruction::BinaryOpsBegin; + SmallSet<std::pair<Value *, Value*>, 32> Visited; + for (unsigned i = 0; i < Ops.size() - 1; ++i) { + for (unsigned j = i + 1; j < Ops.size(); ++j) { + // Canonicalize operand orderings. 
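// --- Editorial sketch (annotation, not part of the patch) -------------------
// The pair-counting idea behind BuildPairMap and the BestPair search above, as
// a free-standing sketch with simplified types (string operand names instead
// of Value pointers, and without the Visited dedup the patch performs):
#include <algorithm>
#include <cstddef>
#include <map>
#include <string>
#include <utility>
#include <vector>
using PairCountSketch = std::map<std::pair<std::string, std::string>, unsigned>;
void countPairsSketch(const std::vector<std::string> &Ops, PairCountSketch &PM) {
  for (std::size_t i = 0; i + 1 < Ops.size(); ++i)
    for (std::size_t j = i + 1; j < Ops.size(); ++j) {
      auto P = std::minmax(Ops[i], Ops[j]); // canonical order, as in the patch
      ++PM[{P.first, P.second}];            // popular pairs get high counts
    }
}
// --- End sketch --------------------------------------------------------------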
+ Value *Op0 = Ops[i]; + Value *Op1 = Ops[j]; + if (std::less<Value *>()(Op1, Op0)) + std::swap(Op0, Op1); + if (!Visited.insert({Op0, Op1}).second) + continue; + auto res = PairMap[BinaryIdx].insert({{Op0, Op1}, 1}); + if (!res.second) + ++res.first->second; + } + } + } + } +} + PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) { // Get the function's basic blocks in Reverse Post Order. This order is used by // BuildRankMap to pre-calculate ranks correctly. It also excludes dead basic @@ -2198,8 +2293,20 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) { // Calculate the rank map for F. BuildRankMap(F, RPOT); + // Build the pair map before running reassociate. + // Technically this would be more accurate if we did it after one round + // of reassociation, but in practice it doesn't seem to help much on + // real-world code, so don't waste the compile time running reassociate + // twice. + // If a user wants, they could explicitly run reassociate twice in their + // pass pipeline for further potential gains. + // It might also be possible to update the pair map during runtime, but the + // overhead of that may be large if there are many reassociable chains. + BuildPairMap(RPOT); + MadeChange = false; - // Traverse the same blocks that was analysed by BuildRankMap. + + // Traverse the same blocks that were analysed by BuildRankMap. for (BasicBlock *BI : RPOT) { assert(RankMap.count(&*BI) && "BB should be ranked."); // Optimize every instruction in the basic block. @@ -2238,9 +2345,11 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) { } } - // We are done with the rank map. + // We are done with the rank map and pair map. RankMap.clear(); ValueRankMap.clear(); + for (auto &Entry : PairMap) + Entry.clear(); if (MadeChange) { PreservedAnalyses PA; @@ -2253,10 +2362,13 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) { } namespace { + class ReassociateLegacyPass : public FunctionPass { ReassociatePass Impl; + public: static char ID; // Pass identification, replacement for typeid + ReassociateLegacyPass() : FunctionPass(ID) { initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry()); } @@ -2275,9 +2387,11 @@ namespace { AU.addPreserved<GlobalsAAWrapperPass>(); } }; -} + +} // end anonymous namespace char ReassociateLegacyPass::ID = 0; + INITIALIZE_PASS(ReassociateLegacyPass, "reassociate", "Reassociate expressions", false, false) diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index f19d45329d23..3b45cfa482e6 100644 --- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -12,36 +12,69 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Scalar/RewriteStatepointsForGC.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/SetOperations.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Analysis/CFG.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Statepoint.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" #include "llvm/IR/Value.h" -#include "llvm/IR/Verifier.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <iterator> +#include <set> +#include <string> +#include <utility> +#include <vector> #define DEBUG_TYPE "rewrite-statepoints-for-gc" @@ -52,6 +85,7 @@ static cl::opt<bool> PrintLiveSet("spp-print-liveset", cl::Hidden, cl::init(false)); static cl::opt<bool> PrintLiveSetSize("spp-print-liveset-size", cl::Hidden, cl::init(false)); + // Print out the base pointers for debugging static cl::opt<bool> PrintBasePointers("spp-print-base-pointers", cl::Hidden, cl::init(false)); @@ -67,6 +101,7 @@ static bool ClobberNonLive = true; #else static bool ClobberNonLive = false; #endif + static cl::opt<bool, true> ClobberNonLiveOverride("rs4gc-clobber-non-live", cl::location(ClobberNonLive), cl::Hidden); @@ -75,27 +110,96 @@ static cl::opt<bool> AllowStatepointWithNoDeoptInfo("rs4gc-allow-statepoint-with-no-deopt-info", cl::Hidden, cl::init(true)); +/// The IR fed into RewriteStatepointsForGC may have had attributes and +/// metadata implying dereferenceability that are no longer valid/correct after +/// RewriteStatepointsForGC has run. This is because semantically, after +/// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire +/// heap. stripNonValidData (conservatively) restores +/// correctness by erasing all attributes in the module that externally imply +/// dereferenceability. Similar reasoning also applies to the noalias +/// attributes and metadata. gc.statepoint can touch the entire heap including +/// noalias objects. +/// Apart from attributes and metadata, we also remove instructions that imply +/// constant physical memory: llvm.invariant.start. +static void stripNonValidData(Module &M); + +static bool shouldRewriteStatepointsIn(Function &F); + +PreservedAnalyses RewriteStatepointsForGC::run(Module &M, + ModuleAnalysisManager &AM) { + bool Changed = false; + auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + for (Function &F : M) { + // Nothing to do for declarations. + if (F.isDeclaration() || F.empty()) + continue; + + // Policy choice says not to rewrite - the most common reason is that we're + // compiling code without a GCStrategy. 
+ if (!shouldRewriteStatepointsIn(F)) + continue; + + auto &DT = FAM.getResult<DominatorTreeAnalysis>(F); + auto &TTI = FAM.getResult<TargetIRAnalysis>(F); + auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F); + Changed |= runOnFunction(F, DT, TTI, TLI); + } + if (!Changed) + return PreservedAnalyses::all(); + + // stripNonValidData asserts that shouldRewriteStatepointsIn + // returns true for at least one function in the module. Since at least + // one function changed, we know that the precondition is satisfied. + stripNonValidData(M); + + PreservedAnalyses PA; + PA.preserve<TargetIRAnalysis>(); + PA.preserve<TargetLibraryAnalysis>(); + return PA; +} + namespace { -struct RewriteStatepointsForGC : public ModulePass { + +class RewriteStatepointsForGCLegacyPass : public ModulePass { + RewriteStatepointsForGC Impl; + +public: static char ID; // Pass identification, replacement for typeid - RewriteStatepointsForGC() : ModulePass(ID) { - initializeRewriteStatepointsForGCPass(*PassRegistry::getPassRegistry()); + RewriteStatepointsForGCLegacyPass() : ModulePass(ID), Impl() { + initializeRewriteStatepointsForGCLegacyPassPass( + *PassRegistry::getPassRegistry()); } - bool runOnFunction(Function &F); + bool runOnModule(Module &M) override { bool Changed = false; - for (Function &F : M) - Changed |= runOnFunction(F); + const TargetLibraryInfo &TLI = + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + for (Function &F : M) { + // Nothing to do for declarations. + if (F.isDeclaration() || F.empty()) + continue; + + // Policy choice says not to rewrite - the most common reason is that + // we're compiling code without a GCStrategy. + if (!shouldRewriteStatepointsIn(F)) + continue; - if (Changed) { - // stripNonValidAttributesAndMetadata asserts that shouldRewriteStatepointsIn - // returns true for at least one function in the module. Since at least - // one function changed, we know that the precondition is satisfied. - stripNonValidAttributesAndMetadata(M); + TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); + + Changed |= Impl.runOnFunction(F, DT, TTI, TLI); } - return Changed; + if (!Changed) + return false; + + // stripNonValidData asserts that shouldRewriteStatepointsIn + // returns true for at least one function in the module. Since at least + // one function changed, we know that the precondition is satisfied. + stripNonValidData(M); + return true; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -103,46 +207,33 @@ struct RewriteStatepointsForGC : public ModulePass { // else. We could in theory preserve a lot more analyses here. AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } - - /// The IR fed into RewriteStatepointsForGC may have had attributes and - /// metadata implying dereferenceability that are no longer valid/correct after - /// RewriteStatepointsForGC has run. This is because semantically, after - /// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire - /// heap. stripNonValidAttributesAndMetadata (conservatively) restores - /// correctness by erasing all attributes in the module that externally imply - /// dereferenceability. Similar reasoning also applies to the noalias - /// attributes and metadata. gc.statepoint can touch the entire heap including - /// noalias objects. 
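The rationale in the comment above is easiest to see with a moving collector in mind: any raw address, and any fact attached to it, can be invalidated by a statepoint. A toy simulation of that hazard, with a two-semispace array standing in for the GC heap (illustrative only, no LLVM API involved):

```cpp
#include <cstdio>
#include <cstring>

// A moving collector relocates objects at safepoints, so a raw address
// cached across one goes stale. Two fixed buffers act as semispaces here;
// everything is a stand-in, not LLVM code.
static char Heap[2][16];
static int Live = 0; // index of the semispace currently holding the object

static char *safepoint(char *Obj) {     // stands in for a gc.statepoint
  std::memcpy(Heap[1 - Live], Obj, 16); // the collector moves the object
  Live = 1 - Live;
  return Heap[Live];                    // the "gc.relocate" result
}

int main() {
  std::strcpy(Heap[0], "payload");
  char *Cached = Heap[Live];       // fact established before the call
  char *Reloc = safepoint(Cached); // object moves; new address returned
  std::printf("cached == relocated? %d\n", Cached == Reloc); // 0: stale
  std::printf("through relocated: %s\n", Reloc);             // "payload"
}
```

This is exactly why only the gc.relocate result may be used after the call, and why dereferenceability and noalias facts established before it must be dropped.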
- void stripNonValidAttributesAndMetadata(Module &M); - - // Helpers for stripNonValidAttributesAndMetadata - void stripNonValidAttributesAndMetadataFromBody(Function &F); - void stripNonValidAttributesFromPrototype(Function &F); - // Certain metadata on instructions are invalid after running RS4GC. - // Optimizations that run after RS4GC can incorrectly use this metadata to - // optimize functions. We drop such metadata on the instruction. - void stripInvalidMetadataFromInstruction(Instruction &I); }; -} // namespace -char RewriteStatepointsForGC::ID = 0; +} // end anonymous namespace -ModulePass *llvm::createRewriteStatepointsForGCPass() { - return new RewriteStatepointsForGC(); +char RewriteStatepointsForGCLegacyPass::ID = 0; + +ModulePass *llvm::createRewriteStatepointsForGCLegacyPass() { + return new RewriteStatepointsForGCLegacyPass(); } -INITIALIZE_PASS_BEGIN(RewriteStatepointsForGC, "rewrite-statepoints-for-gc", +INITIALIZE_PASS_BEGIN(RewriteStatepointsForGCLegacyPass, + "rewrite-statepoints-for-gc", "Make relocations explicit at statepoints", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_END(RewriteStatepointsForGC, "rewrite-statepoints-for-gc", +INITIALIZE_PASS_END(RewriteStatepointsForGCLegacyPass, + "rewrite-statepoints-for-gc", "Make relocations explicit at statepoints", false, false) namespace { + struct GCPtrLivenessData { /// Values defined in this block. MapVector<BasicBlock *, SetVector<Value *>> KillSet; + /// Values used in this block (and thus live); does not include values /// killed within this block. MapVector<BasicBlock *, SetVector<Value *>> LiveSet; @@ -166,10 +257,10 @@ struct GCPtrLivenessData { // Generally, after the execution of a full findBasePointer call, only the // base relation will remain. Internally, we add a mixture of the two // types, then update all the second type to the first type -typedef MapVector<Value *, Value *> DefiningValueMapTy; -typedef SetVector<Value *> StatepointLiveSetTy; -typedef MapVector<AssertingVH<Instruction>, AssertingVH<Value>> - RematerializedValueMapTy; +using DefiningValueMapTy = MapVector<Value *, Value *>; +using StatepointLiveSetTy = SetVector<Value *>; +using RematerializedValueMapTy = + MapVector<AssertingVH<Instruction>, AssertingVH<Value>>; struct PartiallyConstructedSafepointRecord { /// The set of values known to be live across this safepoint @@ -191,7 +282,8 @@ struct PartiallyConstructedSafepointRecord { /// Maps rematerialized copy to its original value. RematerializedValueMapTy RematerializedValues; }; -} + +} // end anonymous namespace static ArrayRef<Use> GetDeoptBundleOperands(ImmutableCallSite CS) { Optional<OperandBundleUse> DeoptBundle = @@ -254,7 +346,7 @@ static bool containsGCPtrType(Type *Ty) { if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) return containsGCPtrType(AT->getElementType()); if (StructType *ST = dyn_cast<StructType>(Ty)) - return any_of(ST->subtypes(), containsGCPtrType); + return llvm::any_of(ST->subtypes(), containsGCPtrType); return false; } @@ -299,7 +391,9 @@ analyzeParsePointLiveness(DominatorTree &DT, } static bool isKnownBaseResult(Value *V); + namespace { + /// A single base defining value - An immediate base defining value for an /// instruction 'Def' is an input to 'Def' whose base is also a base of 'Def'.
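KillSet and LiveSet above are the classic def/use inputs to backward liveness: a value is live into a block if it is used there, or live out of it and not defined there. A self-contained sketch of that fixed-point computation, with made-up block and value names (the pass performs the analogous computation over SetVectors of gc pointers):

```cpp
#include <cstdio>
#include <set>
#include <string>
#include <vector>

// Backward liveness: LiveIn is Use plus (LiveOut minus Kill); LiveOut is the
// union of the successors' LiveIn sets.
struct Block {
  std::set<std::string> Use, Kill; // LiveSet / KillSet roles in the pass
  std::vector<int> Succs;
  std::set<std::string> LiveIn, LiveOut;
};

int main() {
  // B0 defines p; B1 uses p and defines q; B2 uses q.
  std::vector<Block> CFG(3);
  CFG[0].Kill = {"p"};
  CFG[0].Succs = {1};
  CFG[1].Use = {"p"};
  CFG[1].Kill = {"q"};
  CFG[1].Succs = {2};
  CFG[2].Use = {"q"};

  for (bool Changed = true; Changed;) { // iterate to a fixed point
    Changed = false;
    for (int i = 2; i >= 0; --i) {      // reverse order converges fast here
      Block &B = CFG[i];
      std::set<std::string> Out;
      for (int S : B.Succs)
        Out.insert(CFG[S].LiveIn.begin(), CFG[S].LiveIn.end());
      std::set<std::string> In = B.Use;
      for (const std::string &V : Out)
        if (!B.Kill.count(V))
          In.insert(V);
      Changed |= (In != B.LiveIn) || (Out != B.LiveOut);
      B.LiveIn = std::move(In);
      B.LiveOut = std::move(Out);
    }
  }
  // Expect: B0 live-in 0 values, B1 live-in 1 (p), B2 live-in 1 (q).
  for (int i = 0; i < 3; ++i)
    std::printf("B%d live-in: %zu\n", i, CFG[i].LiveIn.size());
}
```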
/// For instructions which have multiple pointer [vector] inputs or that @@ -311,9 +405,11 @@ namespace { struct BaseDefiningValueResult { /// Contains the value which is the base defining value. Value * const BDV; + /// True if the base defining value is also known to be an actual base /// pointer. const bool IsKnownBase; + BaseDefiningValueResult(Value *BDV, bool IsKnownBase) : BDV(BDV), IsKnownBase(IsKnownBase) { #ifndef NDEBUG @@ -324,7 +420,8 @@ struct BaseDefiningValueResult { #endif } }; -} + +} // end anonymous namespace static BaseDefiningValueResult findBaseDefiningValue(Value *I); @@ -374,6 +471,11 @@ findBaseDefiningValueOfVector(Value *I) { if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) return findBaseDefiningValue(GEP->getPointerOperand()); + // If the pointer comes through a bitcast of a vector of pointers to + // a vector of another type of pointer, then look through the bitcast + if (auto *BC = dyn_cast<BitCastInst>(I)) + return findBaseDefiningValue(BC->getOperand(0)); + // A PHI or Select is a base defining value. The outer findBasePointer // algorithm is responsible for constructing a base value for this BDV. assert((isa<SelectInst>(I) || isa<PHINode>(I)) && @@ -429,7 +531,6 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) { if (isa<LoadInst>(I)) // The value loaded is an gc base itself return BaseDefiningValueResult(I, true); - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) // The base of this GEP is the base @@ -442,12 +543,11 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) { break; case Intrinsic::experimental_gc_statepoint: llvm_unreachable("statepoints don't produce pointers"); - case Intrinsic::experimental_gc_relocate: { + case Intrinsic::experimental_gc_relocate: // Rerunning safepoint insertion after safepoints are already // inserted is not supported. It could probably be made to work, // but why are you doing this? There's no good reason. llvm_unreachable("repeat safepoint insertion is not supported"); - } case Intrinsic::gcroot: // Currently, this mechanism hasn't been extended to work with gcroot. // There's no reason it couldn't be, but I haven't thought about the @@ -551,6 +651,7 @@ static bool isKnownBaseResult(Value *V) { } namespace { + /// Models the state of a single base defining value in the findBasePointer /// algorithm for determining where a new instruction is needed to propagate /// the base of this BDV. @@ -558,7 +659,7 @@ class BDVState { public: enum Status { Unknown, Base, Conflict }; - BDVState() : Status(Unknown), BaseValue(nullptr) {} + BDVState() : BaseValue(nullptr) {} explicit BDVState(Status Status, Value *BaseValue = nullptr) : Status(Status), BaseValue(BaseValue) { @@ -597,16 +698,17 @@ public: case Conflict: OS << "C"; break; - }; + } OS << " (" << getBaseValue() << " - " << (getBaseValue() ? getBaseValue()->getName() : "nullptr") << "): "; } private: - Status Status; + Status Status = Unknown; AssertingVH<Value> BaseValue; // Non-null only if Status == Base. 
}; -} + +} // end anonymous namespace #ifndef NDEBUG static raw_ostream &operator<<(raw_ostream &OS, const BDVState &State) { @@ -1169,7 +1271,7 @@ static void CreateGCRelocates(ArrayRef<Value *> LiveVariables, return; auto FindIndex = [](ArrayRef<Value *> LiveVec, Value *Val) { - auto ValIt = find(LiveVec, Val); + auto ValIt = llvm::find(LiveVec, Val); assert(ValIt != LiveVec.end() && "Val not found in LiveVec!"); size_t Index = std::distance(LiveVec.begin(), ValIt); assert(Index < LiveVec.size() && "Bug in std::find?"); @@ -1229,7 +1331,7 @@ class DeferredReplacement { AssertingVH<Instruction> New; bool IsDeoptimize = false; - DeferredReplacement() {} + DeferredReplacement() = default; public: static DeferredReplacement createRAUW(Instruction *Old, Instruction *New) { @@ -1286,7 +1388,8 @@ public: OldI->eraseFromParent(); } }; -} + +} // end anonymous namespace static StringRef getDeoptLowering(CallSite CS) { const char *DeoptLowering = "deopt-lowering"; @@ -1304,7 +1407,6 @@ static StringRef getDeoptLowering(CallSite CS) { return "live-through"; } - static void makeStatepointExplicitImpl(const CallSite CS, /* to replace */ const SmallVectorImpl<Value *> &BasePtrs, @@ -1528,7 +1630,6 @@ static void insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs, DenseMap<Value *, Value *> &AllocaMap, DenseSet<Value *> &VisitedLiveValues) { - for (User *U : GCRelocs) { GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U); if (!Relocate) @@ -1564,7 +1665,6 @@ static void insertRematerializationStores( const RematerializedValueMapTy &RematerializedValues, DenseMap<Value *, Value *> &AllocaMap, DenseSet<Value *> &VisitedLiveValues) { - for (auto RematerializedValuePair: RematerializedValues) { Instruction *RematerializedValue = RematerializedValuePair.first; Value *OriginalValue = RematerializedValuePair.second; @@ -1830,7 +1930,6 @@ static void findLiveReferences( static Value* findRematerializableChainToBasePointer( SmallVectorImpl<Instruction*> &ChainToBase, Value *CurrentValue) { - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurrentValue)) { ChainToBase.push_back(GEP); return findRematerializableChainToBasePointer(ChainToBase, @@ -1886,7 +1985,6 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain, } static bool AreEquivalentPhiNodes(PHINode &OrigRootPhi, PHINode &AlternateRootPhi) { - unsigned PhiNum = OrigRootPhi.getNumIncomingValues(); if (PhiNum != AlternateRootPhi.getNumIncomingValues() || OrigRootPhi.getParent() != AlternateRootPhi.getParent()) @@ -1910,7 +2008,6 @@ static bool AreEquivalentPhiNodes(PHINode &OrigRootPhi, PHINode &AlternateRootPh return false; } return true; - } // From the statepoint live set pick values that are cheaper to recompute then @@ -2297,8 +2394,7 @@ static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH, AH.setAttributes(AH.getAttributes().removeAttributes(Ctx, Index, R)); } -void -RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) { +static void stripNonValidAttributesFromPrototype(Function &F) { LLVMContext &Ctx = F.getContext(); for (Argument &A : F.args()) @@ -2310,8 +2406,10 @@ RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) { RemoveNonValidAttrAtIndex(Ctx, F, AttributeList::ReturnIndex); } -void RewriteStatepointsForGC::stripInvalidMetadataFromInstruction(Instruction &I) { - +/// Certain metadata on instructions are invalid after running RS4GC. +/// Optimizations that run after RS4GC can incorrectly use this metadata to +/// optimize functions. 
We drop such metadata on the instruction. +static void stripInvalidMetadataFromInstruction(Instruction &I) { if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) return; // These are the attributes that are still valid on loads and stores after @@ -2337,18 +2435,32 @@ void RewriteStatepointsForGC::stripInvalidMetadataFromInstruction(Instruction &I // Drops all metadata on the instruction other than ValidMetadataAfterRS4GC. I.dropUnknownNonDebugMetadata(ValidMetadataAfterRS4GC); - } -void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Function &F) { +static void stripNonValidDataFromBody(Function &F) { if (F.empty()) return; LLVMContext &Ctx = F.getContext(); MDBuilder Builder(Ctx); + // Set of invariantstart instructions that we need to remove. + // Use this to avoid invalidating the instruction iterator. + SmallVector<IntrinsicInst*, 12> InvariantStartInstructions; for (Instruction &I : instructions(F)) { + // invariant.start on memory location implies that the referenced memory + // location is constant and unchanging. This is no longer true after + // RewriteStatepointsForGC runs because there can be calls to gc.statepoint + // which frees the entire heap and the presence of invariant.start allows + // the optimizer to sink the load of a memory location past a statepoint, + // which is incorrect. + if (auto *II = dyn_cast<IntrinsicInst>(&I)) + if (II->getIntrinsicID() == Intrinsic::invariant_start) { + InvariantStartInstructions.push_back(II); + continue; + } + if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) { assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!"); bool IsImmutableTBAA = @@ -2378,6 +2490,12 @@ void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Functio RemoveNonValidAttrAtIndex(Ctx, CS, AttributeList::ReturnIndex); } } + + // Delete the invariant.start instructions and RAUW undef. + for (auto *II : InvariantStartInstructions) { + II->replaceAllUsesWith(UndefValue::get(II->getType())); + II->eraseFromParent(); + } } /// Returns true if this function should be rewritten by this pass. The main @@ -2394,35 +2512,28 @@ static bool shouldRewriteStatepointsIn(Function &F) { return false; } -void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) { +static void stripNonValidData(Module &M) { #ifndef NDEBUG - assert(any_of(M, shouldRewriteStatepointsIn) && "precondition!"); + assert(llvm::any_of(M, shouldRewriteStatepointsIn) && "precondition!"); #endif for (Function &F : M) stripNonValidAttributesFromPrototype(F); for (Function &F : M) - stripNonValidAttributesAndMetadataFromBody(F); + stripNonValidDataFromBody(F); } -bool RewriteStatepointsForGC::runOnFunction(Function &F) { - // Nothing to do for declarations. - if (F.isDeclaration() || F.empty()) - return false; - - // Policy choice says not to rewrite - the most common reason is that we're - // compiling code without a GCStrategy. 
- if (!shouldRewriteStatepointsIn(F)) - return false; - - DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); - TargetTransformInfo &TTI = - getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); +bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT, + TargetTransformInfo &TTI, + const TargetLibraryInfo &TLI) { + assert(!F.isDeclaration() && !F.empty() && + "need function body to rewrite statepoints in"); + assert(shouldRewriteStatepointsIn(F) && "mismatch in rewrite decision"); - auto NeedsRewrite = [](Instruction &I) { + auto NeedsRewrite = [&TLI](Instruction &I) { if (ImmutableCallSite CS = ImmutableCallSite(&I)) - return !callsGCLeafFunction(CS) && !isStatepoint(CS); + return !callsGCLeafFunction(CS, TLI) && !isStatepoint(CS); return false; }; @@ -2662,7 +2773,6 @@ static void computeLiveInValues(DominatorTree &DT, Function &F, static void findLiveSetAtInst(Instruction *Inst, GCPtrLivenessData &Data, StatepointLiveSetTy &Out) { - BasicBlock *BB = Inst->getParent(); // Note: The copy is intentional and required diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 4822cf7cce0f..e5866b4718da 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -18,30 +18,49 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/SCCP.h" +#include "llvm/Transforms/Scalar/SCCP.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/PointerIntPair.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueLattice.h" +#include "llvm/Analysis/ValueLatticeUtils.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/InstVisitor.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/SCCP.h" #include "llvm/Transforms/Utils/Local.h" -#include <algorithm> +#include <cassert> +#include <utility> +#include <vector> + using namespace llvm; #define DEBUG_TYPE "sccp" @@ -52,8 +71,11 @@ STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable"); STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP"); STATISTIC(IPNumArgsElimed ,"Number of arguments constant propagated by IPSCCP"); STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP"); +STATISTIC(IPNumRangeInfoUsed, "Number of times constant range info was used by " + "IPSCCP"); namespace { + /// LatticeVal class - This class represents the different lattice values that /// an LLVM value may occupy. It is a simple class with value semantics.
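To fix intuition for this lattice before the class body continues: values start unknown, may be lowered to one constant, and collapse to overdefined when two different constants meet. A minimal model of that transition logic (a sketch only; the real class also carries a forcedconstant state used during undef resolution):

```cpp
#include <cstdio>

// Three of LatticeVal's states and the merge rule between them. The solver
// only ever moves values down this lattice, which bounds its running time.
struct Lattice {
  enum State { Unknown, Constant, Overdefined } S = Unknown;
  int C = 0; // meaningful only when S == Constant

  // Returns true when the state changed (users would be re-enqueued).
  bool mergeConstant(int V) {
    if (S == Overdefined)
      return false; // already at the bottom; nothing to do
    if (S == Unknown) {
      S = Constant;
      C = V;
      return true;
    }
    if (C == V)
      return false; // merging the same constant: no change
    S = Overdefined; // two different constants conflict
    return true;
  }
};

int main() {
  Lattice L;
  bool Ch = L.mergeConstant(7);
  std::printf("changed=%d state=%d\n", Ch, L.S); // 1, state=1 (Constant)
  Ch = L.mergeConstant(7);
  std::printf("changed=%d state=%d\n", Ch, L.S); // 0, state=1
  Ch = L.mergeConstant(9);
  std::printf("changed=%d state=%d\n", Ch, L.S); // 1, state=2 (Overdefined)
}
```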
/// @@ -88,9 +110,11 @@ public: LatticeVal() : Val(nullptr, unknown) {} bool isUnknown() const { return getLatticeValue() == unknown; } + bool isConstant() const { return getLatticeValue() == constant || getLatticeValue() == forcedconstant; } + bool isOverdefined() const { return getLatticeValue() == overdefined; } Constant *getConstant() const { @@ -153,11 +177,15 @@ public: Val.setInt(forcedconstant); Val.setPointer(V); } -}; -} // end anonymous namespace. - -namespace { + ValueLatticeElement toValueLattice() const { + if (isOverdefined()) + return ValueLatticeElement::getOverdefined(); + if (isConstant()) + return ValueLatticeElement::get(getConstant()); + return ValueLatticeElement(); + } +}; //===----------------------------------------------------------------------===// // @@ -167,37 +195,38 @@ namespace { class SCCPSolver : public InstVisitor<SCCPSolver> { const DataLayout &DL; const TargetLibraryInfo *TLI; - SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable. - DenseMap<Value*, LatticeVal> ValueState; // The state each value is in. + SmallPtrSet<BasicBlock *, 8> BBExecutable; // The BBs that are executable. + DenseMap<Value *, LatticeVal> ValueState; // The state each value is in. + // The state each parameter is in. + DenseMap<Value *, ValueLatticeElement> ParamState; /// StructValueState - This maintains ValueState for values that have /// StructType, for example for formal arguments, calls, insertelement, etc. - /// - DenseMap<std::pair<Value*, unsigned>, LatticeVal> StructValueState; + DenseMap<std::pair<Value *, unsigned>, LatticeVal> StructValueState; /// GlobalValue - If we are tracking any values for the contents of a global /// variable, we keep a mapping from the constant accessor to the element of /// the global, to the currently known value. If the value becomes /// overdefined, its entry is simply removed from this map. - DenseMap<GlobalVariable*, LatticeVal> TrackedGlobals; + DenseMap<GlobalVariable *, LatticeVal> TrackedGlobals; /// TrackedRetVals - If we are tracking arguments into and the return /// value out of a function, it will have an entry in this map, indicating /// what the known return value for the function is. - DenseMap<Function*, LatticeVal> TrackedRetVals; + DenseMap<Function *, LatticeVal> TrackedRetVals; /// TrackedMultipleRetVals - Same as TrackedRetVals, but used for functions /// that return multiple values. - DenseMap<std::pair<Function*, unsigned>, LatticeVal> TrackedMultipleRetVals; + DenseMap<std::pair<Function *, unsigned>, LatticeVal> TrackedMultipleRetVals; /// MRVFunctionsTracked - Each function in TrackedMultipleRetVals is /// represented here for efficient lookup. - SmallPtrSet<Function*, 16> MRVFunctionsTracked; + SmallPtrSet<Function *, 16> MRVFunctionsTracked; /// TrackingIncomingArguments - This is the set of functions whose /// arguments we make optimistic assumptions about and try to prove as /// constants. - SmallPtrSet<Function*, 16> TrackingIncomingArguments; + SmallPtrSet<Function *, 16> TrackingIncomingArguments; /// The reason for two worklists is that overdefined is the lowest state /// on the lattice, and moving things to overdefined as fast as possible @@ -206,16 +235,17 @@ class SCCPSolver : public InstVisitor<SCCPSolver> { /// By having a separate worklist, we accomplish this because everything /// possibly overdefined will become overdefined at the soonest possible /// point.
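The scheduling policy that comment describes can be modeled in a few lines: always drain the overdefined worklist before touching the ordinary one, so nothing is re-evaluated at an intermediate lattice state it is about to leave anyway. A sketch with strings standing in for Value pointers:

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Two worklists; the overdefined one always has priority.
int main() {
  std::vector<std::string> OverdefinedWL = {"x", "y"};
  std::vector<std::string> InstWL = {"a", "b", "c"};

  while (!OverdefinedWL.empty() || !InstWL.empty()) {
    while (!OverdefinedWL.empty()) {
      std::string V = OverdefinedWL.back();
      OverdefinedWL.pop_back();
      std::printf("revisit users of overdefined %s\n", V.c_str());
    }
    if (!InstWL.empty()) {
      std::string V = InstWL.back();
      InstWL.pop_back();
      std::printf("revisit users of updated %s\n", V.c_str());
      if (V == "b") // pretend one of b's users just went overdefined
        OverdefinedWL.push_back("b.user");
    }
  }
}
```

Note how "b.user" is processed before "a": anything that hits bottom jumps the queue, which is exactly the effect the comment above is after.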
- SmallVector<Value*, 64> OverdefinedInstWorkList; - SmallVector<Value*, 64> InstWorkList; - + SmallVector<Value *, 64> OverdefinedInstWorkList; + SmallVector<Value *, 64> InstWorkList; - SmallVector<BasicBlock*, 64> BBWorkList; // The BasicBlock work list + // The BasicBlock work list + SmallVector<BasicBlock *, 64> BBWorkList; /// KnownFeasibleEdges - Entries in this set are edges which have already had /// PHI nodes retriggered. - typedef std::pair<BasicBlock*, BasicBlock*> Edge; + using Edge = std::pair<BasicBlock *, BasicBlock *>; DenseSet<Edge> KnownFeasibleEdges; + public: SCCPSolver(const DataLayout &DL, const TargetLibraryInfo *tli) : DL(DL), TLI(tli) {} @@ -263,8 +293,13 @@ public: TrackingIncomingArguments.insert(F); } + /// Returns true if the given function is in the solver's set of + /// argument-tracked functions. + bool isArgumentTrackedFunction(Function *F) { + return TrackingIncomingArguments.count(F); + } + /// Solve - Solve for constants and executable blocks. - /// void Solve(); /// ResolvedUndefsIn - While solving the dataflow for a function, we assume @@ -290,14 +325,23 @@ public: return StructValues; } - LatticeVal getLatticeValueFor(Value *V) const { - DenseMap<Value*, LatticeVal>::const_iterator I = ValueState.find(V); - assert(I != ValueState.end() && "V is not in valuemap!"); - return I->second; + ValueLatticeElement getLatticeValueFor(Value *V) { + assert(!V->getType()->isStructTy() && + "Should use getStructLatticeValueFor"); + std::pair<DenseMap<Value*, ValueLatticeElement>::iterator, bool> + PI = ParamState.insert(std::make_pair(V, ValueLatticeElement())); + ValueLatticeElement &LV = PI.first->second; + if (PI.second) { + DenseMap<Value*, LatticeVal>::const_iterator I = ValueState.find(V); + assert(I != ValueState.end() && + "V not found in ValueState nor Paramstate map!"); + LV = I->second.toValueLattice(); + } + + return LV; } /// getTrackedRetVals - Get the inferred return value map. - /// const DenseMap<Function*, LatticeVal> &getTrackedRetVals() { return TrackedRetVals; } @@ -349,7 +393,6 @@ private: // markConstant - Make a value be marked as "constant". If the value // is not already a constant, add it to the instruction work list so that // the users of the instruction are updated later. - // void markConstant(LatticeVal &IV, Value *V, Constant *C) { if (!IV.markConstant(C)) return; DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n'); @@ -369,7 +412,6 @@ private: pushToWorkList(IV, V); } - // markOverdefined - Make a value be marked as "overdefined". If the // value is not already overdefined, add it to the overdefined instruction // work list so that the users of the instruction are updated later. @@ -402,7 +444,6 @@ private: mergeInValue(ValueState[V], V, MergeWithV); } - /// getValueState - Return the LatticeVal object that corresponds to the /// value. This function handles the case when the value hasn't been seen yet /// by properly seeding constants etc. @@ -426,6 +467,18 @@ private: return LV; } + ValueLatticeElement &getParamState(Value *V) { + assert(!V->getType()->isStructTy() && "Should use getStructValueState"); + + std::pair<DenseMap<Value*, ValueLatticeElement>::iterator, bool> + PI = ParamState.insert(std::make_pair(V, ValueLatticeElement())); + ValueLatticeElement &LV = PI.first->second; + if (PI.second) + LV = getValueState(V).toValueLattice(); + + return LV; + } + /// getStructValueState - Return the LatticeVal object that corresponds to the /// value/field pair. 
This function handles the case when the value hasn't /// been seen yet by properly seeding constants etc. @@ -457,7 +510,6 @@ private: return LV; } - /// markEdgeExecutable - Mark a basic block as executable, adding it to the BB /// work list if it is not already executable. void markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) { @@ -480,18 +532,15 @@ private: // getFeasibleSuccessors - Return a vector of booleans to indicate which // successors are reachable from a given terminator instruction. - // void getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl<bool> &Succs); // isEdgeFeasible - Return true if the control flow edge from the 'From' basic // block to the 'To' basic block is currently feasible. - // bool isEdgeFeasible(BasicBlock *From, BasicBlock *To); // OperandChangedState - This method is invoked on all of the users of an // instruction that was just changed state somehow. Based on this // information, we need to update the specified user of this instruction. - // void OperandChangedState(Instruction *I) { if (BBExecutable.count(I->getParent())) // Inst is executable? visit(*I); @@ -506,6 +555,7 @@ private: void visitPHINode(PHINode &I); // Terminators + void visitReturnInst(ReturnInst &I); void visitTerminatorInst(TerminatorInst &TI); @@ -515,26 +565,32 @@ private: void visitCmpInst(CmpInst &I); void visitExtractValueInst(ExtractValueInst &EVI); void visitInsertValueInst(InsertValueInst &IVI); + void visitCatchSwitchInst(CatchSwitchInst &CPI) { markOverdefined(&CPI); visitTerminatorInst(CPI); } // Instructions that cannot be folded away. + void visitStoreInst (StoreInst &I); void visitLoadInst (LoadInst &I); void visitGetElementPtrInst(GetElementPtrInst &I); + void visitCallInst (CallInst &I) { visitCallSite(&I); } + void visitInvokeInst (InvokeInst &II) { visitCallSite(&II); visitTerminatorInst(II); } + void visitCallSite (CallSite CS); void visitResumeInst (TerminatorInst &I) { /*returns void*/ } void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ } void visitFenceInst (FenceInst &I) { /*returns void*/ } + void visitInstruction(Instruction &I) { // All the instructions we don't do any special handling for just // go to overdefined. @@ -545,10 +601,8 @@ private: } // end anonymous namespace - // getFeasibleSuccessors - Return a vector of booleans to indicate which // successors are reachable from a given terminator instruction. -// void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl<bool> &Succs) { Succs.resize(TI.getNumSuccessors()); @@ -631,10 +685,8 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, llvm_unreachable("SCCP: Don't know how to handle this terminator!"); } - // isEdgeFeasible - Return true if the control flow edge from the 'From' basic // block to the 'To' basic block is currently feasible. -// bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { assert(BBExecutable.count(To) && "Dest should always be alive!"); @@ -710,7 +762,6 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { // destination executable // 7. If a conditional branch has a value that is overdefined, make all // successors executable. -// void SCCPSolver::visitPHINode(PHINode &PN) { // If this PN returns a struct, just mark the result overdefined. // TODO: We could do a lot better than this if code actually uses this. @@ -730,7 +781,6 @@ void SCCPSolver::visitPHINode(PHINode &PN) { // constant, and they agree with each other, the PHI becomes the identical // constant. 
If they are constant and don't agree, the PHI is overdefined. // If there are no executable operands, the PHI remains unknown. - // Constant *OperandVal = nullptr; for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { LatticeVal IV = getValueState(PN.getIncomingValue(i)); @@ -761,7 +811,6 @@ void SCCPSolver::visitPHINode(PHINode &PN) { // arguments that agree with each other(and OperandVal is the constant) or // OperandVal is null because there are no defined incoming arguments. If // this is the case, the PHI remains unknown. - // if (OperandVal) markConstant(&PN, OperandVal); // Acquire operand value } @@ -789,7 +838,6 @@ void SCCPSolver::visitReturnInst(ReturnInst &I) { for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) mergeInValue(TrackedMultipleRetVals[std::make_pair(F, i)], F, getStructValueState(ResultOp, i)); - } } @@ -820,7 +868,6 @@ void SCCPSolver::visitCastInst(CastInst &I) { } } - void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) { // If this returns a struct, mark all elements over defined, we don't track // structs in structs. @@ -969,7 +1016,6 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { } } - markOverdefined(&I); } @@ -998,7 +1044,6 @@ void SCCPSolver::visitCmpInst(CmpInst &I) { // Handle getelementptr instructions. If all operands are constants then we // can turn this into a getelementptr ConstantExpr. -// void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) { if (ValueState[&I].isOverdefined()) return; @@ -1044,7 +1089,6 @@ void SCCPSolver::visitStoreInst(StoreInst &SI) { TrackedGlobals.erase(I); // No need to keep tracking this! } - // Handle load instructions. If the operand is a constant pointer to a constant // global, we can replace the load with the loaded constant value! void SCCPSolver::visitLoadInst(LoadInst &I) { @@ -1108,7 +1152,6 @@ CallOverdefined: // a declaration, maybe we can constant fold it. if (F && F->isDeclaration() && !I->getType()->isStructTy() && canConstantFoldCallTo(CS, F)) { - SmallVector<Constant*, 8> Operands; for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end(); AI != E; ++AI) { @@ -1162,6 +1205,9 @@ CallOverdefined: mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg); } } else { + // Most other parts of the Solver still only use the simpler value + // lattice, so we propagate changes for parameters to both lattices. + getParamState(&*AI).mergeIn(getValueState(*CAI).toValueLattice(), DL); mergeInValue(&*AI, getValueState(*CAI)); } } @@ -1360,7 +1406,6 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // undef & X -> 0. X could be zero. markForcedConstant(&I, Constant::getNullValue(ITy)); return true; - case Instruction::Or: // Both operands undef -> undef if (Op0LV.isUnknown() && Op1LV.isUnknown()) @@ -1368,7 +1413,6 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // undef | X -> -1. X could be -1. markForcedConstant(&I, Constant::getAllOnesValue(ITy)); return true; - case Instruction::Xor: // undef ^ undef -> 0; strictly speaking, this is not strictly // necessary, but we try to be nice to people who expect this @@ -1379,7 +1423,6 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { } // undef ^ X -> undef break; - case Instruction::SDiv: case Instruction::UDiv: case Instruction::SRem: @@ -1397,7 +1440,6 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // undef % X -> 0. X could be 1. markForcedConstant(&I, Constant::getNullValue(ITy)); return true; - case Instruction::AShr: // X >>a undef -> undef. 
if (Op1LV.isUnknown()) break; @@ -1464,7 +1506,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { markOverdefined(&I); return true; case Instruction::Call: - case Instruction::Invoke: { + case Instruction::Invoke: // There are two reasons a call can have an undef result // 1. It could be tracked. // 2. It could be constant-foldable. @@ -1478,7 +1520,6 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // we do not know what return values are valid. markOverdefined(&I); return true; - } default: // If we don't know what should happen here, conservatively mark it // overdefined. @@ -1557,11 +1598,56 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { return false; } +static bool tryToReplaceWithConstantRange(SCCPSolver &Solver, Value *V) { + bool Changed = false; + + // Currently we only use range information for integer values. + if (!V->getType()->isIntegerTy()) + return false; + + const ValueLatticeElement &IV = Solver.getLatticeValueFor(V); + if (!IV.isConstantRange()) + return false; + + for (auto UI = V->uses().begin(), E = V->uses().end(); UI != E;) { + const Use &U = *UI++; + auto *Icmp = dyn_cast<ICmpInst>(U.getUser()); + if (!Icmp || !Solver.isBlockExecutable(Icmp->getParent())) + continue; + + auto getIcmpLatticeValue = [&](Value *Op) { + if (auto *C = dyn_cast<Constant>(Op)) + return ValueLatticeElement::get(C); + return Solver.getLatticeValueFor(Op); + }; + + ValueLatticeElement A = getIcmpLatticeValue(Icmp->getOperand(0)); + ValueLatticeElement B = getIcmpLatticeValue(Icmp->getOperand(1)); + + Constant *C = nullptr; + if (A.satisfiesPredicate(Icmp->getPredicate(), B)) + C = ConstantInt::getTrue(Icmp->getType()); + else if (A.satisfiesPredicate(Icmp->getInversePredicate(), B)) + C = ConstantInt::getFalse(Icmp->getType()); + + if (C) { + Icmp->replaceAllUsesWith(C); + DEBUG(dbgs() << "Replacing " << *Icmp << " with " << *C + << ", because of range information " << A << " " << B + << "\n"); + Icmp->eraseFromParent(); + Changed = true; + } + } + return Changed; +} + static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) { Constant *Const = nullptr; if (V->getType()->isStructTy()) { std::vector<LatticeVal> IVs = Solver.getStructLatticeValueFor(V); - if (any_of(IVs, [](const LatticeVal &LV) { return LV.isOverdefined(); })) + if (llvm::any_of(IVs, + [](const LatticeVal &LV) { return LV.isOverdefined(); })) return false; std::vector<Constant *> ConstVals; auto *ST = dyn_cast<StructType>(V->getType()); @@ -1573,10 +1659,19 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) { } Const = ConstantStruct::get(ST, ConstVals); } else { - LatticeVal IV = Solver.getLatticeValueFor(V); + const ValueLatticeElement &IV = Solver.getLatticeValueFor(V); if (IV.isOverdefined()) return false; - Const = IV.isConstant() ? IV.getConstant() : UndefValue::get(V->getType()); + + if (IV.isConstantRange()) { + if (IV.getConstantRange().isSingleElement()) + Const = + ConstantInt::get(V->getType(), IV.asConstantInteger().getValue()); + else + return false; + } else + Const = + IV.isConstant() ? IV.getConstant() : UndefValue::get(V->getType()); } assert(Const && "Constant is nullptr here!"); DEBUG(dbgs() << " Constant: " << *Const << " = " << *V << '\n'); @@ -1588,7 +1683,6 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) { // runSCCP() - Run the Sparse Conditional Constant Propagation algorithm, // and return true if the function was modified. 
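The icmp folding performed by tryToReplaceWithConstantRange above reduces to an interval argument. A simplified model for the unsigned less-than case, with a hypothetical Range type standing in for LLVM's ConstantRange (which additionally handles wrapped ranges and every predicate):

```cpp
#include <cstdint>
#include <cstdio>

// If the operand ranges admit only one outcome, the compare folds to a
// constant, mirroring the satisfiesPredicate checks above.
struct Range {          // non-empty, half-open: values in [Lo, Hi)
  uint64_t Lo, Hi;
  uint64_t min() const { return Lo; }
  uint64_t max() const { return Hi - 1; }
};

enum Fold { AlwaysFalse, AlwaysTrue, Unknown };

static Fold foldULT(Range A, Range B) { // can "A u< B" be decided?
  if (A.max() < B.min())
    return AlwaysTrue;  // the largest A is still below the smallest B
  if (A.min() >= B.max())
    return AlwaysFalse; // no A value is below any B value
  return Unknown;
}

int main() {
  std::printf("%d\n", foldULT({0, 10}, {10, 20}));  // 1: always true
  std::printf("%d\n", foldULT({50, 60}, {10, 51})); // 0: always false
  std::printf("%d\n", foldULT({0, 10}, {5, 20}));   // 2: unknown
}
```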
-// static bool runSCCP(Function &F, const DataLayout &DL, const TargetLibraryInfo *TLI) { DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n"); @@ -1628,7 +1722,6 @@ static bool runSCCP(Function &F, const DataLayout &DL, // Iterate over all of the instructions in a function, replacing them with // constants if we have found them to be of constant values. - // for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) { Instruction *Inst = &*BI++; if (Inst->getType()->isVoidTy() || isa<TerminatorInst>(Inst)) @@ -1659,6 +1752,7 @@ PreservedAnalyses SCCPPass::run(Function &F, FunctionAnalysisManager &AM) { } namespace { + //===--------------------------------------------------------------------===// // /// SCCP Class - This class uses the SCCPSolver to implement a per-function @@ -1666,18 +1760,20 @@ namespace { /// class SCCPLegacyPass : public FunctionPass { public: + // Pass identification, replacement for typeid + static char ID; + + SCCPLegacyPass() : FunctionPass(ID) { + initializeSCCPLegacyPassPass(*PassRegistry::getPassRegistry()); + } + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); } - static char ID; // Pass identification, replacement for typeid - SCCPLegacyPass() : FunctionPass(ID) { - initializeSCCPLegacyPassPass(*PassRegistry::getPassRegistry()); - } // runOnFunction - Run the Sparse Conditional Constant Propagation // algorithm, and return true if the function was modified. - // bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; @@ -1687,9 +1783,11 @@ public: return runSCCP(F, DL, TLI); } }; + } // end anonymous namespace char SCCPLegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(SCCPLegacyPass, "sccp", "Sparse Conditional Constant Propagation", false, false) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) @@ -1699,38 +1797,11 @@ INITIALIZE_PASS_END(SCCPLegacyPass, "sccp", // createSCCPPass - This is the public interface to this file. FunctionPass *llvm::createSCCPPass() { return new SCCPLegacyPass(); } -static bool AddressIsTaken(const GlobalValue *GV) { - // Delete any dead constantexpr klingons. - GV->removeDeadConstantUsers(); - - for (const Use &U : GV->uses()) { - const User *UR = U.getUser(); - if (const auto *SI = dyn_cast<StoreInst>(UR)) { - if (SI->getOperand(0) == GV || SI->isVolatile()) - return true; // Storing addr of GV. - } else if (isa<InvokeInst>(UR) || isa<CallInst>(UR)) { - // Make sure we are calling the function, not passing the address. - ImmutableCallSite CS(cast<Instruction>(UR)); - if (!CS.isCallee(&U)) - return true; - } else if (const auto *LI = dyn_cast<LoadInst>(UR)) { - if (LI->isVolatile()) - return true; - } else if (isa<BlockAddress>(UR)) { - // blockaddress doesn't take the address of the function, it takes addr - // of label. - } else { - return true; - } - } - return false; -} - static void findReturnsToZap(Function &F, - SmallPtrSet<Function *, 32> &AddressTakenFunctions, - SmallVector<ReturnInst *, 8> &ReturnsToZap) { + SmallVector<ReturnInst *, 8> &ReturnsToZap, + SCCPSolver &Solver) { // We can only do this if we know that nothing else can call the function. 
- if (!F.hasLocalLinkage() || AddressTakenFunctions.count(&F)) + if (!Solver.isArgumentTrackedFunction(&F)) return; for (BasicBlock &BB : F) @@ -1743,39 +1814,22 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, const TargetLibraryInfo *TLI) { SCCPSolver Solver(DL, TLI); - // AddressTakenFunctions - This set keeps track of the address-taken functions - // that are in the input. As IPSCCP runs through and simplifies code, - // functions that were address taken can end up losing their - // address-taken-ness. Because of this, we keep track of their addresses from - // the first pass so we can use them for the later simplification pass. - SmallPtrSet<Function*, 32> AddressTakenFunctions; - // Loop over all functions, marking arguments to those with their addresses // taken or that are external as overdefined. - // for (Function &F : M) { if (F.isDeclaration()) continue; - // If this is an exact definition of this function, then we can propagate - // information about its result into callsites of it. - // Don't touch naked functions. They may contain asm returning a - // value we don't see, so we may end up interprocedurally propagating - // the return value incorrectly. - if (F.hasExactDefinition() && !F.hasFnAttribute(Attribute::Naked)) + // Determine if we can track the function's return values. If so, add the + // function to the solver's set of return-tracked functions. + if (canTrackReturnsInterprocedurally(&F)) Solver.AddTrackedFunction(&F); - // If this function only has direct calls that we can see, we can track its - // arguments and return value aggressively, and can assume it is not called - // unless we see evidence to the contrary. - if (F.hasLocalLinkage()) { - if (F.hasAddressTaken()) { - AddressTakenFunctions.insert(&F); - } - else { - Solver.AddArgumentTrackedFunction(&F); - continue; - } + // Determine if we can track the function's arguments. If so, add the + // function to the solver's set of argument-tracked functions. + if (canTrackArgumentsInterprocedurally(&F)) { + Solver.AddArgumentTrackedFunction(&F); + continue; } // Assume the function is called. @@ -1786,13 +1840,14 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, Solver.markOverdefined(&AI); } - // Loop over global variables. We inform the solver about any internal global - // variables that do not have their 'addresses taken'. If they don't have - // their addresses taken, we can propagate constants through them. - for (GlobalVariable &G : M.globals()) - if (!G.isConstant() && G.hasLocalLinkage() && - G.hasDefinitiveInitializer() && !AddressIsTaken(&G)) + // Determine if we can track any of the module's global variables. If so, add + // the global variables we can track to the solver's set of tracked global + // variables. + for (GlobalVariable &G : M.globals()) { + G.removeDeadConstantUsers(); + if (canTrackGlobalVariableInterprocedurally(&G)) Solver.TrackValueOfGlobalVariable(&G); + } // Solve for constants. bool ResolvedUndefs = true; @@ -1809,7 +1864,6 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, // Iterate over all of the instructions in the module, replacing them with // constants if we have found them to be of constant values. 
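The "Solve for constants" loop above follows the same driver pattern as runSCCP: solve to a fixed point, let ResolvedUndefsIn commit to a meaning for surviving undefs, and re-solve if that seeded new facts. The skeleton, with stubs standing in for the real solver (illustrative only):

```cpp
#include <cstdio>

// Solve, resolve leftover undefs, and re-solve until nothing changes.
static int Pending = 2; // pretend two rounds of undef resolution are needed

static void solve() { std::printf("solve to a fixed point\n"); }

static bool resolvedUndefs() { // true if it committed any undef to a value
  if (Pending == 0)
    return false;
  --Pending;
  std::printf("resolved undefs; facts changed, solving again\n");
  return true;
}

int main() {
  bool ResolvedUndefs = true;
  while (ResolvedUndefs) {
    solve();
    ResolvedUndefs = resolvedUndefs();
  }
}
```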
- // SmallVector<BasicBlock*, 512> BlocksToErase; for (Function &F : M) { @@ -1818,9 +1872,15 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, if (Solver.isBlockExecutable(&F.front())) for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E; - ++AI) - if (!AI->use_empty() && tryToReplaceWithConstant(Solver, &*AI)) + ++AI) { + if (!AI->use_empty() && tryToReplaceWithConstant(Solver, &*AI)) { ++IPNumArgsElimed; + continue; + } + + if (!AI->use_empty() && tryToReplaceWithConstantRange(Solver, &*AI)) + ++IPNumRangeInfoUsed; + } for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { if (!Solver.isBlockExecutable(&*BB)) { @@ -1897,7 +1957,7 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, Function *F = I.first; if (I.second.isOverdefined() || F->getReturnType()->isVoidTy()) continue; - findReturnsToZap(*F, AddressTakenFunctions, ReturnsToZap); + findReturnsToZap(*F, ReturnsToZap, Solver); } for (const auto &F : Solver.getMRVFunctionsTracked()) { @@ -1905,7 +1965,7 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, "The return type should be a struct"); StructType *STy = cast<StructType>(F->getReturnType()); if (Solver.isStructLatticeConstant(F, STy)) - findReturnsToZap(*F, AddressTakenFunctions, ReturnsToZap); + findReturnsToZap(*F, ReturnsToZap, Solver); } // Zap all returns which we've identified as zap to change. @@ -1943,6 +2003,7 @@ PreservedAnalyses IPSCCPPass::run(Module &M, ModuleAnalysisManager &AM) { } namespace { + //===--------------------------------------------------------------------===// // /// IPSCCP Class - This class implements interprocedural Sparse Conditional @@ -1969,9 +2030,11 @@ public: AU.addRequired<TargetLibraryInfoWrapperPass>(); } }; + } // end anonymous namespace char IPSCCPLegacyPass::ID = 0; + INITIALIZE_PASS_BEGIN(IPSCCPLegacyPass, "ipsccp", "Interprocedural Sparse Conditional Constant Propagation", false, false) diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index b9cee5b2ba95..bfe3754f0769 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -24,28 +24,54 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/SROA.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallBitVector.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/ADT/iterator.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/PtrUseVisitor.h" -#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/ConstantFolder.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/GlobalAlias.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include 
"llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" -#include "llvm/Support/Chrono.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" @@ -55,6 +81,17 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include <algorithm> +#include <cassert> +#include <chrono> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <iterator> +#include <string> +#include <tuple> +#include <utility> +#include <vector> #ifndef NDEBUG // We only use this for a debug check. @@ -87,11 +124,18 @@ static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices", static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false), cl::Hidden); +/// Hidden option to allow more aggressive splitting. +static cl::opt<bool> +SROASplitNonWholeAllocaSlices("sroa-split-nonwhole-alloca-slices", + cl::init(false), cl::Hidden); + namespace { + /// \brief A custom IRBuilder inserter which prefixes all names, but only in /// Assert builds. class IRBuilderPrefixedInserter : public IRBuilderDefaultInserter { std::string Prefix; + const Twine getNameWithPrefix(const Twine &Name) const { return Name.isTriviallyEmpty() ? Name : Prefix + Name; } @@ -107,11 +151,9 @@ protected: } }; -/// \brief Provide a typedef for IRBuilder that drops names in release builds. -using IRBuilderTy = llvm::IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>; -} +/// \brief Provide a type for IRBuilder that drops names in release builds. +using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>; -namespace { /// \brief A used slice of an alloca. /// /// This structure represents a slice of an alloca used by some instruction. It @@ -120,17 +162,18 @@ namespace { /// or not when forming partitions of the alloca. class Slice { /// \brief The beginning offset of the range. - uint64_t BeginOffset; + uint64_t BeginOffset = 0; /// \brief The ending offset, not included in the range. - uint64_t EndOffset; + uint64_t EndOffset = 0; /// \brief Storage for both the use of this slice and whether it can be /// split. PointerIntPair<Use *, 1, bool> UseAndIsSplittable; public: - Slice() : BeginOffset(), EndOffset() {} + Slice() = default; + Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable) : BeginOffset(BeginOffset), EndOffset(EndOffset), UseAndIsSplittable(U, IsSplittable) {} @@ -180,12 +223,15 @@ public: } bool operator!=(const Slice &RHS) const { return !operator==(RHS); } }; + } // end anonymous namespace namespace llvm { + template <typename T> struct isPodLike; template <> struct isPodLike<Slice> { static const bool value = true; }; -} + +} // end namespace llvm /// \brief Representation of the alloca slices. /// @@ -207,13 +253,15 @@ public: /// \brief Support for iterating over the slices. 
/// @{ - typedef SmallVectorImpl<Slice>::iterator iterator; - typedef iterator_range<iterator> range; + using iterator = SmallVectorImpl<Slice>::iterator; + using range = iterator_range<iterator>; + iterator begin() { return Slices.begin(); } iterator end() { return Slices.end(); } - typedef SmallVectorImpl<Slice>::const_iterator const_iterator; - typedef iterator_range<const_iterator> const_range; + using const_iterator = SmallVectorImpl<Slice>::const_iterator; + using const_range = iterator_range<const_iterator>; + const_iterator begin() const { return Slices.begin(); } const_iterator end() const { return Slices.end(); } /// @} @@ -264,6 +312,7 @@ public: private: template <typename DerivedT, typename RetT = void> class BuilderBase; class SliceBuilder; + friend class AllocaSlices::SliceBuilder; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -320,7 +369,7 @@ private: friend class AllocaSlices; friend class AllocaSlices::partition_iterator; - typedef AllocaSlices::iterator iterator; + using iterator = AllocaSlices::iterator; /// \brief The beginning and ending offsets of the alloca for this /// partition. @@ -403,12 +452,12 @@ class AllocaSlices::partition_iterator /// \brief We also need to keep track of the maximum split end offset seen. /// FIXME: Do we really? - uint64_t MaxSplitSliceEndOffset; + uint64_t MaxSplitSliceEndOffset = 0; /// \brief Sets the partition to be empty at given iterator, and sets the /// end iterator. partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE) - : P(SI), SE(SE), MaxSplitSliceEndOffset(0) { + : P(SI), SE(SE) { // If not already at the end, advance our state to form the initial // partition. if (SI != SE) @@ -432,19 +481,21 @@ class AllocaSlices::partition_iterator // Remove the uses which have ended in the prior partition. This // cannot change the max split slice end because we just checked that // the prior partition ended prior to that max. 
- P.SplitTails.erase( - remove_if(P.SplitTails, - [&](Slice *S) { return S->endOffset() <= P.EndOffset; }), - P.SplitTails.end()); - assert(any_of(P.SplitTails, - [&](Slice *S) { - return S->endOffset() == MaxSplitSliceEndOffset; - }) && + P.SplitTails.erase(llvm::remove_if(P.SplitTails, + [&](Slice *S) { + return S->endOffset() <= + P.EndOffset; + }), + P.SplitTails.end()); + assert(llvm::any_of(P.SplitTails, + [&](Slice *S) { + return S->endOffset() == MaxSplitSliceEndOffset; + }) && "Could not find the current max split slice offset!"); - assert(all_of(P.SplitTails, - [&](Slice *S) { - return S->endOffset() <= MaxSplitSliceEndOffset; - }) && + assert(llvm::all_of(P.SplitTails, + [&](Slice *S) { + return S->endOffset() <= MaxSplitSliceEndOffset; + }) && "Max split slice end offset is not actually the max!"); } } @@ -608,7 +659,8 @@ static Value *foldPHINodeOrSelectInst(Instruction &I) { class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> { friend class PtrUseVisitor<SliceBuilder>; friend class InstVisitor<SliceBuilder>; - typedef PtrUseVisitor<SliceBuilder> Base; + + using Base = PtrUseVisitor<SliceBuilder>; const uint64_t AllocSize; AllocaSlices &AS; @@ -996,8 +1048,9 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) return; } - Slices.erase(remove_if(Slices, [](const Slice &S) { return S.isDead(); }), - Slices.end()); + Slices.erase( + llvm::remove_if(Slices, [](const Slice &S) { return S.isDead(); }), + Slices.end()); #ifndef NDEBUG if (SROARandomShuffleSlices) { @@ -1820,11 +1873,12 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { // do that until all the backends are known to produce good code for all // integer vector types. if (!HaveCommonEltTy) { - CandidateTys.erase(remove_if(CandidateTys, - [](VectorType *VTy) { - return !VTy->getElementType()->isIntegerTy(); - }), - CandidateTys.end()); + CandidateTys.erase( + llvm::remove_if(CandidateTys, + [](VectorType *VTy) { + return !VTy->getElementType()->isIntegerTy(); + }), + CandidateTys.end()); // If there were no integer vector types, give up. if (CandidateTys.empty()) @@ -2151,8 +2205,9 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, class llvm::sroa::AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> { // Befriend the base class so it can delegate to private visit methods. - friend class llvm::InstVisitor<AllocaSliceRewriter, bool>; - typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base; + friend class InstVisitor<AllocaSliceRewriter, bool>; + + using Base = InstVisitor<AllocaSliceRewriter, bool>; const DataLayout &DL; AllocaSlices &AS; @@ -2182,16 +2237,18 @@ class llvm::sroa::AllocaSliceRewriter // The original offset of the slice currently being rewritten relative to // the original alloca. - uint64_t BeginOffset, EndOffset; + uint64_t BeginOffset = 0; + uint64_t EndOffset = 0; + // The new offsets of the slice currently being rewritten relative to the // original alloca. uint64_t NewBeginOffset, NewEndOffset; uint64_t SliceSize; - bool IsSplittable; - bool IsSplit; - Use *OldUse; - Instruction *OldPtr; + bool IsSplittable = false; + bool IsSplit = false; + Use *OldUse = nullptr; + Instruction *OldPtr = nullptr; // Track post-rewrite users which are PHI nodes and Selects. SmallSetVector<PHINode *, 8> &PHIUsers; @@ -2221,8 +2278,7 @@ public: VecTy(PromotableVecTy), ElementTy(VecTy ? VecTy->getElementType() : nullptr), ElementSize(VecTy ? 
DL.getTypeSizeInBits(ElementTy) / 8 : 0),
-        BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(),
-        OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers),
+        PHIUsers(PHIUsers), SelectUsers(SelectUsers),
         IRB(NewAI.getContext(), ConstantFolder()) {
     if (VecTy) {
       assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 &&
@@ -2987,6 +3043,7 @@ private:
 };
 
 namespace {
+
 /// \brief Visitor to rewrite aggregate loads and stores as scalar.
 ///
 /// This pass aggressively rewrites all aggregate loads and stores on
@@ -2994,7 +3051,7 @@ namespace {
 /// with scalar loads and stores.
 class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
   // Befriend the base class so it can delegate to private visit methods.
-  friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>;
+  friend class InstVisitor<AggLoadStoreRewriter, bool>;
 
   /// Queue of pointer uses to analyze and potentially rewrite.
   SmallVector<Use *, 8> Queue;
@@ -3037,12 +3094,15 @@ private:
 protected:
   /// The builder used to form new instructions.
   IRBuilderTy IRB;
+
   /// The indices to be used with insert- or extractvalue to select the
   /// appropriate value within the aggregate.
   SmallVector<unsigned, 4> Indices;
+
   /// The indices to a GEP instruction which will move Ptr to the correct slot
   /// within the aggregate.
   SmallVector<Value *, 4> GEPIndices;
+
   /// The base pointer of the original op, used as a base for GEPing the
   /// split operations.
   Value *Ptr;
@@ -3193,7 +3253,8 @@ private:
     return false;
   }
 };
-}
+
+} // end anonymous namespace
 
 /// \brief Strip aggregate type wrapping.
 ///
@@ -3485,58 +3546,60 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
   // match relative to their starting offset. We have to verify this prior to
   // any rewriting.
   Stores.erase(
-      remove_if(Stores,
-                [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
-                  // Lookup the load we are storing in our map of split
-                  // offsets.
-                  auto *LI = cast<LoadInst>(SI->getValueOperand());
-                  // If it was completely unsplittable, then we're done,
-                  // and this store can't be pre-split.
-                  if (UnsplittableLoads.count(LI))
-                    return true;
+      llvm::remove_if(Stores,
+                      [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
+                        // Lookup the load we are storing in our map of split
+                        // offsets.
+                        auto *LI = cast<LoadInst>(SI->getValueOperand());
+                        // If it was completely unsplittable, then we're done,
+                        // and this store can't be pre-split.
+                        if (UnsplittableLoads.count(LI))
+                          return true;
 
-                  auto LoadOffsetsI = SplitOffsetsMap.find(LI);
-                  if (LoadOffsetsI == SplitOffsetsMap.end())
-                    return false; // Unrelated loads are definitely safe.
-                  auto &LoadOffsets = LoadOffsetsI->second;
+                        auto LoadOffsetsI = SplitOffsetsMap.find(LI);
+                        if (LoadOffsetsI == SplitOffsetsMap.end())
+                          return false; // Unrelated loads are definitely safe.
+                        auto &LoadOffsets = LoadOffsetsI->second;
 
-                  // Now lookup the store's offsets.
-                  auto &StoreOffsets = SplitOffsetsMap[SI];
+                        // Now lookup the store's offsets.
+                        auto &StoreOffsets = SplitOffsetsMap[SI];
 
-                  // If the relative offsets of each split in the load and
-                  // store match exactly, then we can split them and we
-                  // don't need to remove them here.
+                        if (LoadOffsets.Splits == StoreOffsets.Splits)
+                          return false;
 
-                  DEBUG(dbgs() << "    Mismatched splits for load and store:\n"
-                               << "      " << *LI << "\n"
-                               << "      " << *SI << "\n");
+                        DEBUG(dbgs()
+                              << "    Mismatched splits for load and store:\n"
+                              << "      " << *LI << "\n"
+                              << "      " << *SI << "\n");
 
-                  // We've found a store and load that we need to split
-                  // with mismatched relative splits. Just give up on them
-                  // and remove both instructions from our list of
-                  // candidates.
-                  UnsplittableLoads.insert(LI);
-                  return true;
-                }),
+                        // We've found a store and load that we need to split
+                        // with mismatched relative splits. Just give up on them
+                        // and remove both instructions from our list of
+                        // candidates.
+                        UnsplittableLoads.insert(LI);
+                        return true;
+                      }),
       Stores.end());
   // Now we have to go *back* through all the stores, because a later store may
   // have caused an earlier store's load to become unsplittable and if it is
   // unsplittable for the later store, then we can't rely on it being split in
   // the earlier store either.
-  Stores.erase(remove_if(Stores,
-                         [&UnsplittableLoads](StoreInst *SI) {
-                           auto *LI = cast<LoadInst>(SI->getValueOperand());
-                           return UnsplittableLoads.count(LI);
-                         }),
+  Stores.erase(llvm::remove_if(Stores,
+                               [&UnsplittableLoads](StoreInst *SI) {
+                                 auto *LI =
+                                     cast<LoadInst>(SI->getValueOperand());
+                                 return UnsplittableLoads.count(LI);
+                               }),
                Stores.end());
   // Once we've established all the loads that can't be split for some reason,
   // filter out any that made it into our list.
-  Loads.erase(remove_if(Loads,
-                        [&UnsplittableLoads](LoadInst *LI) {
-                          return UnsplittableLoads.count(LI);
-                        }),
+  Loads.erase(llvm::remove_if(Loads,
+                              [&UnsplittableLoads](LoadInst *LI) {
+                                return UnsplittableLoads.count(LI);
+                              }),
               Loads.end());
 
   // If no loads or stores are left, there is no pre-splitting to be done for
@@ -3804,7 +3867,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
   }
 
   // Remove the killed slices that have been pre-split.
-  AS.erase(remove_if(AS, [](const Slice &S) { return S.isDead(); }), AS.end());
+  AS.erase(llvm::remove_if(AS, [](const Slice &S) { return S.isDead(); }),
+           AS.end());
 
   // Insert our new slices. This will sort and merge them into the sorted
   // sequence.
@@ -3819,7 +3883,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
   // Finally, don't try to promote any allocas that now require re-splitting.
   // They have already been added to the worklist above.
   PromotableAllocas.erase(
-      remove_if(
+      llvm::remove_if(
           PromotableAllocas,
           [&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }),
       PromotableAllocas.end());
@@ -3989,27 +4053,58 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
   // First try to pre-split loads and stores.
   Changed |= presplitLoadsAndStores(AI, AS);
 
-  // Now that we have identified any pre-splitting opportunities, mark any
-  // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail
-  // to split these during pre-splitting, we want to force them to be
-  // rewritten into a partition.
+  // Now that we have identified any pre-splitting opportunities,
+  // mark loads and stores unsplittable except for the following case.
+  // We leave a slice splittable if all other slices are disjoint or fully
+  // included in the slice, such as whole-alloca loads and stores.
+  // If we fail to split these during pre-splitting, we want to force them
+  // to be rewritten into a partition.
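+  // A hedged illustration (hypothetical slice layout, not from this
+  // change's tests): a 16-byte alloca accessed only by i64 loads and
+  // stores over [0,8) and [8,16) leaves byte offsets 0, 8, and 16
+  // splittable, so an [0,8) slice stays splittable even though it does
+  // not span the whole alloca; a slice crossing offset 8 would clear
+  // that bit and force the [0,8) and [8,16) slices to be made
+  // unsplittable below instead.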
   bool IsSorted = true;
-  for (Slice &S : AS) {
-    if (!S.isSplittable())
-      continue;
-    // FIXME: We currently leave whole-alloca splittable loads and stores. This
-    // used to be the only splittable loads and stores and we need to be
-    // confident that the above handling of splittable loads and stores is
-    // completely sufficient before we forcibly disable the remaining handling.
-    if (S.beginOffset() == 0 &&
-        S.endOffset() >= DL.getTypeAllocSize(AI.getAllocatedType()))
-      continue;
-    if (isa<LoadInst>(S.getUse()->getUser()) ||
-        isa<StoreInst>(S.getUse()->getUser())) {
-      S.makeUnsplittable();
-      IsSorted = false;
+
+  uint64_t AllocaSize = DL.getTypeAllocSize(AI.getAllocatedType());
+  const uint64_t MaxBitVectorSize = 1024;
+  if (SROASplitNonWholeAllocaSlices && AllocaSize <= MaxBitVectorSize) {
+    // If a byte boundary is included in any load or store, a slice starting or
+    // ending at the boundary is not splittable.
+    SmallBitVector SplittableOffset(AllocaSize + 1, true);
+    for (Slice &S : AS)
+      for (unsigned O = S.beginOffset() + 1;
+           O < S.endOffset() && O < AllocaSize; O++)
+        SplittableOffset.reset(O);
+
+    for (Slice &S : AS) {
+      if (!S.isSplittable())
+        continue;
+
+      if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
+          (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
+        continue;
+
+      if (isa<LoadInst>(S.getUse()->getUser()) ||
+          isa<StoreInst>(S.getUse()->getUser())) {
+        S.makeUnsplittable();
+        IsSorted = false;
+      }
     }
   }
+  else {
+    // We only allow whole-alloca splittable loads and stores
+    // for a large alloca to avoid creating too large a BitVector.
+    for (Slice &S : AS) {
+      if (!S.isSplittable())
+        continue;
+
+      if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
+        continue;
+
+      if (isa<LoadInst>(S.getUse()->getUser()) ||
+          isa<StoreInst>(S.getUse()->getUser())) {
+        S.makeUnsplittable();
+        IsSorted = false;
+      }
+    }
+  }
+
   if (!IsSorted)
     std::sort(AS.begin(), AS.end());
 
@@ -4044,9 +4139,11 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
 
   // Migrate debug information from the old alloca to the new alloca(s)
   // and the individual partitions.
-  if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(&AI)) {
-    auto *Var = DbgDecl->getVariable();
-    auto *Expr = DbgDecl->getExpression();
+  TinyPtrVector<DbgInfoIntrinsic *> DbgDeclares = FindDbgAddrUses(&AI);
+  if (!DbgDeclares.empty()) {
+    auto *Var = DbgDeclares.front()->getVariable();
+    auto *Expr = DbgDeclares.front()->getExpression();
+    auto VarSize = Var->getSizeInBits();
     DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
     uint64_t AllocaSize = DL.getTypeSizeInBits(AI.getAllocatedType());
     for (auto Fragment : Fragments) {
@@ -4062,21 +4159,43 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
       uint64_t Size = Fragment.Size;
       if (ExprFragment) {
         uint64_t AbsEnd =
-          ExprFragment->OffsetInBits + ExprFragment->SizeInBits;
+            ExprFragment->OffsetInBits + ExprFragment->SizeInBits;
         if (Start >= AbsEnd)
           // No need to describe a SROAed padding.
           continue;
         Size = std::min(Size, AbsEnd - Start);
       }
-      FragmentExpr = DIB.createFragmentExpression(Start, Size);
+      // The new, smaller fragment is stenciled out from the old fragment.
+      if (auto OrigFragment = FragmentExpr->getFragmentInfo()) {
+        assert(Start >= OrigFragment->OffsetInBits &&
+               "new fragment is outside of original fragment");
+        Start -= OrigFragment->OffsetInBits;
+      }
+
+      // The alloca may be larger than the variable.
+ if (VarSize) { + if (Size > *VarSize) + Size = *VarSize; + if (Size == 0 || Start + Size > *VarSize) + continue; + } + + // Avoid creating a fragment expression that covers the entire variable. + if (!VarSize || *VarSize != Size) { + if (auto E = + DIExpression::createFragmentExpression(Expr, Start, Size)) + FragmentExpr = *E; + else + continue; + } } - // Remove any existing dbg.declare intrinsic describing the same alloca. - if (DbgDeclareInst *OldDDI = FindAllocaDbgDeclare(Fragment.Alloca)) - OldDDI->eraseFromParent(); + // Remove any existing intrinsics describing the same alloca. + for (DbgInfoIntrinsic *OldDII : FindDbgAddrUses(Fragment.Alloca)) + OldDII->eraseFromParent(); DIB.insertDeclare(Fragment.Alloca, Var, FragmentExpr, - DbgDecl->getDebugLoc(), &AI); + DbgDeclares.front()->getDebugLoc(), &AI); } } return Changed; @@ -4175,12 +4294,22 @@ bool SROA::runOnAlloca(AllocaInst &AI) { /// /// We also record the alloca instructions deleted here so that they aren't /// subsequently handed to mem2reg to promote. -void SROA::deleteDeadInstructions( +bool SROA::deleteDeadInstructions( SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) { + bool Changed = false; while (!DeadInsts.empty()) { Instruction *I = DeadInsts.pop_back_val(); DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n"); + // If the instruction is an alloca, find the possible dbg.declare connected + // to it, and remove it too. We must do this before calling RAUW or we will + // not be able to find it. + if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { + DeletedAllocas.insert(AI); + for (DbgInfoIntrinsic *OldDII : FindDbgAddrUses(AI)) + OldDII->eraseFromParent(); + } + I->replaceAllUsesWith(UndefValue::get(I->getType())); for (Use &Operand : I->operands()) @@ -4191,15 +4320,11 @@ void SROA::deleteDeadInstructions( DeadInsts.insert(U); } - if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { - DeletedAllocas.insert(AI); - if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(AI)) - DbgDecl->eraseFromParent(); - } - ++NumDeleted; I->eraseFromParent(); + Changed = true; } + return Changed; } /// \brief Promote the allocas, using the best available technique. @@ -4241,7 +4366,7 @@ PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT, do { while (!Worklist.empty()) { Changed |= runOnAlloca(*Worklist.pop_back_val()); - deleteDeadInstructions(DeletedAllocas); + Changed |= deleteDeadInstructions(DeletedAllocas); // Remove the deleted allocas from various lists so that we don't try to // continue processing them. 
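One ordering constraint in the rewritten deleteDeadInstructions above is easy to miss: the debug intrinsics are erased before replaceAllUsesWith runs, because FindDbgAddrUses reaches them through the alloca's uses and would find nothing once those uses have been rewritten to undef. A minimal sketch of just that ordering, reusing only calls that already appear in this diff (a sketch, not a drop-in replacement for the pass code):

    // Sketch: erase the debug users of a dead alloca *before* RAUW.
    // Assumes the usual LLVM headers plus the one declaring FindDbgAddrUses.
    static void eraseDeadAllocaWithDbgUsers(llvm::AllocaInst *AI) {
      // Must happen first; after RAUW these intrinsics no longer use AI.
      for (llvm::DbgInfoIntrinsic *DII : llvm::FindDbgAddrUses(AI))
        DII->eraseFromParent();
      AI->replaceAllUsesWith(llvm::UndefValue::get(AI->getType()));
      AI->eraseFromParent();
    }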
@@ -4249,7 +4374,7 @@ PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT, auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); }; Worklist.remove_if(IsInSet); PostPromotionWorklist.remove_if(IsInSet); - PromotableAllocas.erase(remove_if(PromotableAllocas, IsInSet), + PromotableAllocas.erase(llvm::remove_if(PromotableAllocas, IsInSet), PromotableAllocas.end()); DeletedAllocas.clear(); } @@ -4284,9 +4409,12 @@ class llvm::sroa::SROALegacyPass : public FunctionPass { SROA Impl; public: + static char ID; + SROALegacyPass() : FunctionPass(ID) { initializeSROALegacyPassPass(*PassRegistry::getPassRegistry()); } + bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; @@ -4296,6 +4424,7 @@ public: getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F)); return !PA.areAllPreserved(); } + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); @@ -4304,7 +4433,6 @@ public: } StringRef getPassName() const override { return "SROA"; } - static char ID; }; char SROALegacyPass::ID = 0; diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index ce6f93eb0c15..3b99ddff2e06 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -35,11 +35,13 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeADCELegacyPassPass(Registry); initializeBDCELegacyPassPass(Registry); initializeAlignmentFromAssumptionsPass(Registry); + initializeCallSiteSplittingLegacyPassPass(Registry); initializeConstantHoistingLegacyPassPass(Registry); initializeConstantPropagationPass(Registry); initializeCorrelatedValuePropagationPass(Registry); initializeDCELegacyPassPass(Registry); initializeDeadInstEliminationPass(Registry); + initializeDivRemPairsLegacyPassPass(Registry); initializeScalarizerPass(Registry); initializeDSELegacyPassPass(Registry); initializeGuardWideningLegacyPassPass(Registry); @@ -73,17 +75,17 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLowerExpectIntrinsicPass(Registry); initializeLowerGuardIntrinsicLegacyPassPass(Registry); initializeMemCpyOptLegacyPassPass(Registry); + initializeMergeICmpsPass(Registry); initializeMergedLoadStoreMotionLegacyPassPass(Registry); initializeNaryReassociateLegacyPassPass(Registry); initializePartiallyInlineLibCallsLegacyPassPass(Registry); initializeReassociateLegacyPassPass(Registry); initializeRegToMemPass(Registry); - initializeRewriteStatepointsForGCPass(Registry); + initializeRewriteStatepointsForGCLegacyPassPass(Registry); initializeSCCPLegacyPassPass(Registry); initializeIPSCCPLegacyPassPass(Registry); initializeSROALegacyPassPass(Registry); initializeCFGSimplifyPassPass(Registry); - initializeLateCFGSimplifyPassPass(Registry); initializeStructurizeCFGPass(Registry); initializeSimpleLoopUnswitchLegacyPassPass(Registry); initializeSinkingLegacyPassPass(Registry); @@ -98,6 +100,8 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLoopLoadEliminationPass(Registry); initializeLoopSimplifyCFGLegacyPassPass(Registry); initializeLoopVersioningPassPass(Registry); + initializeEntryExitInstrumenterPass(Registry); + initializePostInlineEntryExitInstrumenterPass(Registry); } void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) { @@ -117,11 +121,7 @@ void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM) { } void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createCFGSimplificationPass()); -} 
- -void LLVMAddLateCFGSimplificationPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createLateCFGSimplificationPass()); + unwrap(PM)->add(createCFGSimplificationPass(1, false, false, true)); } void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) { diff --git a/lib/Transforms/Scalar/Scalarizer.cpp b/lib/Transforms/Scalar/Scalarizer.cpp index d11855f2f3a9..34ed126155be 100644 --- a/lib/Transforms/Scalar/Scalarizer.cpp +++ b/lib/Transforms/Scalar/Scalarizer.cpp @@ -1,4 +1,4 @@ -//===--- Scalarizer.cpp - Scalarize vector operations ---------------------===// +//===- Scalarizer.cpp - Scalarize vector operations -----------------------===// // // The LLVM Compiler Infrastructure // @@ -14,36 +14,59 @@ // //===----------------------------------------------------------------------===// -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Twine.h" #include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/Options.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include <cassert> +#include <cstdint> +#include <iterator> +#include <map> +#include <utility> using namespace llvm; #define DEBUG_TYPE "scalarizer" namespace { + // Used to store the scattered form of a vector. -typedef SmallVector<Value *, 8> ValueVector; +using ValueVector = SmallVector<Value *, 8>; // Used to map a vector Value to its scattered form. We use std::map // because we want iterators to persist across insertion and because the // values are relatively large. -typedef std::map<Value *, ValueVector> ScatterMap; +using ScatterMap = std::map<Value *, ValueVector>; // Lists Instructions that have been replaced with scalar implementations, // along with a pointer to their scattered forms. -typedef SmallVector<std::pair<Instruction *, ValueVector *>, 16> GatherList; +using GatherList = SmallVector<std::pair<Instruction *, ValueVector *>, 16>; // Provides a very limited vector-like interface for lazily accessing one // component of a scattered vector or vector pointer. class Scatterer { public: - Scatterer() {} + Scatterer() = default; // Scatter V into Size components. If new instructions are needed, // insert them before BBI in BB. If Cache is nonnull, use it to cache @@ -71,10 +94,12 @@ private: // called Name that compares X and Y in the same way as FCI. struct FCmpSplitter { FCmpSplitter(FCmpInst &fci) : FCI(fci) {} + Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1, const Twine &Name) const { return Builder.CreateFCmp(FCI.getPredicate(), Op0, Op1, Name); } + FCmpInst &FCI; }; @@ -82,10 +107,12 @@ struct FCmpSplitter { // called Name that compares X and Y in the same way as ICI. 
struct ICmpSplitter { ICmpSplitter(ICmpInst &ici) : ICI(ici) {} + Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1, const Twine &Name) const { return Builder.CreateICmp(ICI.getPredicate(), Op0, Op1, Name); } + ICmpInst &ICI; }; @@ -93,16 +120,18 @@ struct ICmpSplitter { // a binary operator like BO called Name with operands X and Y. struct BinarySplitter { BinarySplitter(BinaryOperator &bo) : BO(bo) {} + Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1, const Twine &Name) const { return Builder.CreateBinOp(BO.getOpcode(), Op0, Op1, Name); } + BinaryOperator &BO; }; // Information about a load or store that we're scalarizing. struct VectorLayout { - VectorLayout() : VecTy(nullptr), ElemTy(nullptr), VecAlign(0), ElemSize(0) {} + VectorLayout() = default; // Return the alignment of element I. uint64_t getElemAlign(unsigned I) { @@ -110,16 +139,16 @@ struct VectorLayout { } // The type of the vector. - VectorType *VecTy; + VectorType *VecTy = nullptr; // The type of each element. - Type *ElemTy; + Type *ElemTy = nullptr; // The alignment of the vector. - uint64_t VecAlign; + uint64_t VecAlign = 0; // The size of each element. - uint64_t ElemSize; + uint64_t ElemSize = 0; }; class Scalarizer : public FunctionPass, @@ -127,8 +156,7 @@ class Scalarizer : public FunctionPass, public: static char ID; - Scalarizer() : - FunctionPass(ID) { + Scalarizer() : FunctionPass(ID) { initializeScalarizerPass(*PassRegistry::getPassRegistry()); } @@ -137,19 +165,19 @@ public: // InstVisitor methods. They return true if the instruction was scalarized, // false if nothing changed. - bool visitInstruction(Instruction &) { return false; } + bool visitInstruction(Instruction &I) { return false; } bool visitSelectInst(SelectInst &SI); - bool visitICmpInst(ICmpInst &); - bool visitFCmpInst(FCmpInst &); - bool visitBinaryOperator(BinaryOperator &); - bool visitGetElementPtrInst(GetElementPtrInst &); - bool visitCastInst(CastInst &); - bool visitBitCastInst(BitCastInst &); - bool visitShuffleVectorInst(ShuffleVectorInst &); - bool visitPHINode(PHINode &); - bool visitLoadInst(LoadInst &); - bool visitStoreInst(StoreInst &); - bool visitCallInst(CallInst &I); + bool visitICmpInst(ICmpInst &ICI); + bool visitFCmpInst(FCmpInst &FCI); + bool visitBinaryOperator(BinaryOperator &BO); + bool visitGetElementPtrInst(GetElementPtrInst &GEPI); + bool visitCastInst(CastInst &CI); + bool visitBitCastInst(BitCastInst &BCI); + bool visitShuffleVectorInst(ShuffleVectorInst &SVI); + bool visitPHINode(PHINode &PHI); + bool visitLoadInst(LoadInst &LI); + bool visitStoreInst(StoreInst &SI); + bool visitCallInst(CallInst &ICI); static void registerOptions() { // This is disabled by default because having separate loads and stores @@ -162,11 +190,12 @@ public: } private: - Scatterer scatter(Instruction *, Value *); - void gather(Instruction *, const ValueVector &); + Scatterer scatter(Instruction *Point, Value *V); + void gather(Instruction *Op, const ValueVector &CV); bool canTransferMetadata(unsigned Kind); - void transferMetadata(Instruction *, const ValueVector &); - bool getVectorLayout(Type *, unsigned, VectorLayout &, const DataLayout &); + void transferMetadata(Instruction *Op, const ValueVector &CV); + bool getVectorLayout(Type *Ty, unsigned Alignment, VectorLayout &Layout, + const DataLayout &DL); bool finish(); template<typename T> bool splitBinary(Instruction &, const T &); @@ -179,9 +208,10 @@ private: bool ScalarizeLoadStore; }; -char Scalarizer::ID = 0; } // end anonymous namespace +char 
Scalarizer::ID = 0;
+
 INITIALIZE_PASS_WITH_OPTIONS(Scalarizer, "scalarizer",
                              "Scalarize vector operations", false, false)
 
@@ -222,7 +252,7 @@ Value *Scatterer::operator[](unsigned I) {
   // Search through a chain of InsertElementInsts looking for element I.
   // Record other elements in the cache.  The new V is still suitable
   // for all uncached indices.
-  for (;;) {
+  while (true) {
     InsertElementInst *Insert = dyn_cast<InsertElementInst>(V);
     if (!Insert)
       break;
diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 84675f41cdd5..209821ff21d7 100644
--- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -1,4 +1,4 @@
-//===-- SeparateConstOffsetFromGEP.cpp - ------------------------*- C++ -*-===//
+//===- SeparateConstOffsetFromGEP.cpp -------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -156,27 +156,44 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <cstdint>
+#include <string>
 
 using namespace llvm;
 using namespace llvm::PatternMatch;
@@ -185,6 +202,7 @@ static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
     "disable-separate-const-offset-from-gep", cl::init(false),
     cl::desc("Do not separate the constant offset from a GEP instruction"),
     cl::Hidden);
+
 // Setting this flag may emit false positives when the input module already
 // contains dead instructions. Therefore, we set it only in unit tests that are
 // free of dead code.
@@ -219,6 +237,7 @@ public:
   /// garbage-collect unused instructions in UserChain.
   static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
                         User *&UserChainTail, const DominatorTree *DT);
+
   /// Looks for a constant offset from the given GEP index without extracting
   /// it. It returns the numeric value of the extracted constant offset (0 if
   /// failed). The meaning of the arguments is the same as in Extract.
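To make the Extract/Find contract concrete, here is a small self-contained sketch in plain C++ (Node and findConstOffset are invented toy names, not part of this pass): it walks an add-expression the way find and findInEitherOperand walk a GEP index, returning one constant summand C so the index can be read as V' + C, with 0 doubling as the failure value exactly as documented above.

    #include <cstdint>

    // Toy stand-in for a GEP index expression: a constant, an opaque
    // symbol, or the sum of two sub-expressions.
    struct Node {
      enum Kind { Const, Symbol, Add } K;
      int64_t Val = 0;                           // valid when K == Const
      const Node *LHS = nullptr, *RHS = nullptr; // valid when K == Add
    };

    // Return a constant summand C such that the expression reads V' + C,
    // or 0 when nothing can be pulled out (mirroring the "0 if failed"
    // convention above).
    int64_t findConstOffset(const Node &N) {
      switch (N.K) {
      case Node::Const:  return N.Val;
      case Node::Symbol: return 0;
      case Node::Add:    // try either operand, like findInEitherOperand
        if (int64_t C = findConstOffset(*N.LHS))
          return C;
        return findConstOffset(*N.RHS);
      }
      return 0;
    }

For a + (b + 5) this returns 5; the real extractor additionally records the visited users in UserChain so the index can later be rebuilt without the constant.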
@@ -229,6 +248,7 @@ private:
   ConstantOffsetExtractor(Instruction *InsertionPt, const DominatorTree *DT)
       : IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()), DT(DT) {
   }
+
  /// Searches the expression that computes V for a non-zero constant C s.t.
  /// V can be reassociated into the form V' + C. If the search is
  /// successful, returns C and updates UserChain as a def-use chain from C to
  /// V; otherwise, UserChain is empty.
  ///
  /// \p V            The given expression
  /// \p SignExtended Whether V will be sign-extended in the computation of the
  ///                 GEP index
  /// \p ZeroExtended Whether V will be zero-extended in the computation of the
  ///                 GEP index
  /// \p NonNegative  Whether V is guaranteed to be non-negative. For example,
  ///                 an index of an inbounds GEP is guaranteed to be
  ///                 non-negative. Leveraging this, we can better split
  ///                 inbounds GEPs.
  APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative);
+
  /// A helper function to look into both operands of a binary operator.
  APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended,
                            bool ZeroExtended);
+
  /// After finding the constant offset C from the GEP index I, we build a new
  /// index I' s.t. I' + C = I. This function builds and returns the new
  /// index I' according to UserChain produced by function "find".
  ///
  /// The building conceptually takes two steps:
  /// 1) iteratively distribute s/zext towards the leaves of the expression tree
  ///    that computes I
  /// 2) reassociate the expression tree to the form I' + C.
  ///
  /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
  /// sext to a, b and 5 so that we have
  ///   sext(a) + (sext(b) + 5).
  /// Then, we reassociate it to
  ///   (sext(a) + sext(b)) + 5.
  /// Given this form, we know I' is sext(a) + sext(b).
  Value *rebuildWithoutConstOffset();
+
  /// After the first step of rebuilding the GEP index without the constant
  /// offset, distribute s/zext to the operands of all operators in UserChain.
  /// e.g., zext(sext(a + (b + 5)) (assuming no overflow) =>
  ///   zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
  ///
  /// The function also updates UserChain to point to new subexpressions after
  /// distributing. e.g., the old UserChain of the above example is
  ///   5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
  /// and the new UserChain is
  ///   zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
  ///     zext(sext(a)) + (zext(sext(b)) + zext(sext(5))
  ///
  /// \p ChainIndex The index to UserChain. UserChain[ChainIndex] is the
  ///               sub-expression to be distributed. ChainIndex is initially
  ///               UserChain.size() - 1, and is decremented during
  ///               the recursion.
  Value *distributeExtsAndCloneChain(unsigned ChainIndex);
+
  /// Reassociates the GEP index to the form I' + C and returns I'.
  Value *removeConstOffset(unsigned ChainIndex);
+
  /// A helper function to apply ExtInsts, a list of s/zext, to value V.
  /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
  /// returns "sext i32 (zext i16 V to i32) to i64".
@@ -303,10 +328,14 @@ private:
  ///
  /// This path helps to rebuild the new GEP index.
  SmallVector<User *, 8> UserChain;
+
  /// A data structure used in rebuildWithoutConstOffset. Contains all
  /// sext/zext instructions along UserChain.
  SmallVector<CastInst *, 16> ExtInsts;
-  Instruction *IP;  /// Insertion position of cloned instructions.
+
+  /// Insertion position of cloned instructions.
+  Instruction *IP;
+
  const DataLayout &DL;
  const DominatorTree *DT;
};
@@ -317,9 +346,10 @@ private:
class SeparateConstOffsetFromGEP : public FunctionPass {
public:
  static char ID;
+
  SeparateConstOffsetFromGEP(const TargetMachine *TM = nullptr,
                             bool LowerGEP = false)
-      : FunctionPass(ID), DL(nullptr), DT(nullptr), TM(TM), LowerGEP(LowerGEP) {
+      : FunctionPass(ID), TM(TM), LowerGEP(LowerGEP) {
    initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry());
  }
@@ -336,12 +366,14 @@ public:
    DL = &M.getDataLayout();
    return false;
  }
+
  bool runOnFunction(Function &F) override;
private:
  /// Tries to split the given GEP into a variadic base and a constant offset,
  /// and returns true if the splitting succeeds.
  bool splitGEP(GetElementPtrInst *GEP);
+
  /// Lower a GEP with multiple indices into multiple GEPs with a single index.
  /// Function splitGEP already split the original GEP into a variadic part and
  /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
  /// variadic part into a set of GEPs with a single index and applies
  /// AccumulativeByteOffset to it.
  ///
  /// \p Variadic              The variadic part of the original GEP.
  /// \p AccumulativeByteOffset The constant offset.
  void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
                              int64_t AccumulativeByteOffset);
+
  /// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form.
  /// Function splitGEP already split the original GEP into a variadic part and
  /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
@@ -360,12 +393,14 @@ private:
  /// \p AccumulativeByteOffset The constant offset.
  void lowerToArithmetics(GetElementPtrInst *Variadic,
                          int64_t AccumulativeByteOffset);
+
  /// Finds the constant offset within each index and accumulates them. If
  /// LowerGEP is true, it finds in indices of both sequential and structure
  /// types, otherwise it only finds in sequential indices. The output
  /// NeedsExtraction indicates whether we successfully found a non-zero
  /// constant offset.
  int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);
+
  /// Canonicalize array indices to pointer-size integers. This helps to
  /// simplify the logic of splitting a GEP. For example, if a + b is a
  /// pointer-size integer, we have
  ///   gep base, a + b = gep (gep base, a), b
  /// However, this equality may not hold if the size of a + b is smaller than
  /// the pointer size, because LLVM conceptually sign-extends GEP indices to
  /// pointer size before computing the address. This canonicalization is very
  /// likely already done in clang and instcombine. Therefore, the program will
  /// probably remain the same.
  ///
  /// Verified in @i32_add in split-gep.ll
  bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
+
  /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow.
  /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting
  /// the constant offset. After extraction, it becomes desirable to reunite the
  /// distributed sexts. For example,
  ///
  ///                              &a[sext(i +nsw (j +nsw 5)]
  ///   => distribute              &a[sext(i) +nsw (sext(j) +nsw 5)]
  ///   => constant extraction     &a[sext(i) + sext(j)] + 5
  ///   => reunion                 &a[sext(i +nsw j)] + 5
  bool reuniteExts(Function &F);
+
  /// A helper that reunites sexts in an instruction.
  bool reuniteExts(Instruction *I);
+
  /// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
  Instruction *findClosestMatchingDominator(const SCEV *Key,
                                            Instruction *Dominatee);
@@ -401,27 +439,33 @@ private:
  void verifyNoDeadCode(Function &F);

  bool hasMoreThanOneUseInLoop(Value *v, Loop *L);
+
  // Swap the index operands of two GEPs.
  void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second);
+
  // Check if it is safe to swap the operands of two GEPs.
  bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second,
                            Loop *CurLoop);

-  const DataLayout *DL;
-  DominatorTree *DT;
+  const DataLayout *DL = nullptr;
+  DominatorTree *DT = nullptr;
  ScalarEvolution *SE;
  const TargetMachine *TM;

  LoopInfo *LI;
  TargetLibraryInfo *TLI;
+
  /// Whether to lower a GEP with multiple indices into arithmetic operations or
  /// multiple GEPs with a single index.
bool LowerGEP; + DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingExprs; }; -} // anonymous namespace + +} // end anonymous namespace char SeparateConstOffsetFromGEP::ID = 0; + INITIALIZE_PASS_BEGIN( SeparateConstOffsetFromGEP, "separate-const-offset-from-gep", "Split GEPs to a variadic base and a constant offset for better CSE", false, diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index aaab5857e0f1..3d0fca0bc3a5 100644 --- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" @@ -28,6 +29,7 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" #include "llvm/Pass.h" @@ -36,11 +38,15 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GenericDomTree.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/ValueMapper.h" #include <algorithm> #include <cassert> #include <iterator> +#include <numeric> #include <utility> #define DEBUG_TYPE "simple-loop-unswitch" @@ -51,6 +57,15 @@ STATISTIC(NumBranches, "Number of branches unswitched"); STATISTIC(NumSwitches, "Number of switches unswitched"); STATISTIC(NumTrivial, "Number of unswitches that are trivial"); +static cl::opt<bool> EnableNonTrivialUnswitch( + "enable-nontrivial-unswitch", cl::init(false), cl::Hidden, + cl::desc("Forcibly enables non-trivial loop unswitching rather than " + "following the configuration passed into the pass.")); + +static cl::opt<int> + UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden, + cl::desc("The cost threshold for unswitching a loop.")); + static void replaceLoopUsesWithConstant(Loop &L, Value &LIC, Constant &Replacement) { assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?"); @@ -68,24 +83,95 @@ static void replaceLoopUsesWithConstant(Loop &L, Value &LIC, } } -/// Update the dominator tree after removing one exiting predecessor of a loop -/// exit block. -static void updateLoopExitIDom(BasicBlock *LoopExitBB, Loop &L, - DominatorTree &DT) { - assert(pred_begin(LoopExitBB) != pred_end(LoopExitBB) && - "Cannot have empty predecessors of the loop exit block if we split " - "off a block to unswitch!"); +/// Update the IDom for a basic block whose predecessor set has changed. +/// +/// This routine is designed to work when the domtree update is relatively +/// localized by leveraging a known common dominator, often a loop header. +/// +/// FIXME: Should consider hand-rolling a slightly more efficient non-DFS +/// approach here as we can do that easily by persisting the candidate IDom's +/// dominating set between each predecessor. +/// +/// FIXME: Longer term, many uses of this can be replaced by an incremental +/// domtree update strategy that starts from a known dominating block and +/// rebuilds that subtree. 
+static bool updateIDomWithKnownCommonDominator(BasicBlock *BB, + BasicBlock *KnownDominatingBB, + DominatorTree &DT) { + assert(pred_begin(BB) != pred_end(BB) && + "This routine does not handle unreachable blocks!"); + + BasicBlock *OrigIDom = DT[BB]->getIDom()->getBlock(); + + BasicBlock *IDom = *pred_begin(BB); + assert(DT.dominates(KnownDominatingBB, IDom) && + "Bad known dominating block!"); - BasicBlock *IDom = *pred_begin(LoopExitBB); // Walk all of the other predecessors finding the nearest common dominator // until all predecessors are covered or we reach the loop header. The loop // header necessarily dominates all loop exit blocks in loop simplified form // so we can early-exit the moment we hit that block. - for (auto PI = std::next(pred_begin(LoopExitBB)), PE = pred_end(LoopExitBB); - PI != PE && IDom != L.getHeader(); ++PI) + for (auto PI = std::next(pred_begin(BB)), PE = pred_end(BB); + PI != PE && IDom != KnownDominatingBB; ++PI) { + assert(DT.dominates(KnownDominatingBB, *PI) && + "Bad known dominating block!"); IDom = DT.findNearestCommonDominator(IDom, *PI); + } + + if (IDom == OrigIDom) + return false; + + DT.changeImmediateDominator(BB, IDom); + return true; +} + +// Note that we don't currently use the IDFCalculator here for two reasons: +// 1) It computes dominator tree levels for the entire function on each run +// of 'compute'. While this isn't terrible, given that we expect to update +// relatively small subtrees of the domtree, it isn't necessarily the right +// tradeoff. +// 2) The interface doesn't fit this usage well. It doesn't operate in +// append-only, and builds several sets that we don't need. +// +// FIXME: Neither of these issues are a big deal and could be addressed with +// some amount of refactoring of IDFCalculator. That would allow us to share +// the core logic here (which is solving the same core problem). +static void appendDomFrontier(DomTreeNode *Node, + SmallSetVector<BasicBlock *, 4> &Worklist, + SmallVectorImpl<DomTreeNode *> &DomNodes, + SmallPtrSetImpl<BasicBlock *> &DomSet) { + assert(DomNodes.empty() && "Must start with no dominator nodes."); + assert(DomSet.empty() && "Must start with an empty dominator set."); + + // First flatten this subtree into sequence of nodes by doing a pre-order + // walk. + DomNodes.push_back(Node); + // We intentionally re-evaluate the size as each node can add new children. + // Because this is a tree walk, this cannot add any duplicates. + for (int i = 0; i < (int)DomNodes.size(); ++i) + DomNodes.insert(DomNodes.end(), DomNodes[i]->begin(), DomNodes[i]->end()); + + // Now create a set of the basic blocks so we can quickly test for + // dominated successors. We could in theory use the DFS numbers of the + // dominator tree for this, but we want this to remain predictably fast + // even while we mutate the dominator tree in ways that would invalidate + // the DFS numbering. + for (DomTreeNode *InnerN : DomNodes) + DomSet.insert(InnerN->getBlock()); - DT.changeImmediateDominator(LoopExitBB, IDom); + // Now re-walk the nodes, appending every successor of every node that isn't + // in the set. Note that we don't append the node itself, even though if it + // is a successor it does not strictly dominate itself and thus it would be + // part of the dominance frontier. The reason we don't append it is that + // the node passed in came *from* the worklist and so it has already been + // processed. 
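+  // (A hypothetical shape for intuition: if this subtree covers {A, B}
+  //  and B also branches to C outside that set, C is appended below as
+  //  a frontier block whose idom may need to move up.)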
+ for (DomTreeNode *InnerN : DomNodes) + for (BasicBlock *SuccBB : successors(InnerN->getBlock())) + if (!DomSet.count(SuccBB)) + Worklist.insert(SuccBB); + + DomNodes.clear(); + DomSet.clear(); } /// Update the dominator tree after unswitching a particular former exit block. @@ -127,58 +213,14 @@ static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH, // dominator frontier to see if it additionally should move up the dominator // tree. This lambda appends the dominator frontier for a node on the // worklist. - // - // Note that we don't currently use the IDFCalculator here for two reasons: - // 1) It computes dominator tree levels for the entire function on each run - // of 'compute'. While this isn't terrible, given that we expect to update - // relatively small subtrees of the domtree, it isn't necessarily the right - // tradeoff. - // 2) The interface doesn't fit this usage well. It doesn't operate in - // append-only, and builds several sets that we don't need. - // - // FIXME: Neither of these issues are a big deal and could be addressed with - // some amount of refactoring of IDFCalculator. That would allow us to share - // the core logic here (which is solving the same core problem). SmallSetVector<BasicBlock *, 4> Worklist; + + // Scratch data structures reused by domfrontier finding. SmallVector<DomTreeNode *, 4> DomNodes; SmallPtrSet<BasicBlock *, 4> DomSet; - auto AppendDomFrontier = [&](DomTreeNode *Node) { - assert(DomNodes.empty() && "Must start with no dominator nodes."); - assert(DomSet.empty() && "Must start with an empty dominator set."); - - // First flatten this subtree into sequence of nodes by doing a pre-order - // walk. - DomNodes.push_back(Node); - // We intentionally re-evaluate the size as each node can add new children. - // Because this is a tree walk, this cannot add any duplicates. - for (int i = 0; i < (int)DomNodes.size(); ++i) - DomNodes.insert(DomNodes.end(), DomNodes[i]->begin(), DomNodes[i]->end()); - - // Now create a set of the basic blocks so we can quickly test for - // dominated successors. We could in theory use the DFS numbers of the - // dominator tree for this, but we want this to remain predictably fast - // even while we mutate the dominator tree in ways that would invalidate - // the DFS numbering. - for (DomTreeNode *InnerN : DomNodes) - DomSet.insert(InnerN->getBlock()); - - // Now re-walk the nodes, appending every successor of every node that isn't - // in the set. Note that we don't append the node itself, even though if it - // is a successor it does not strictly dominate itself and thus it would be - // part of the dominance frontier. The reason we don't append it is that - // the node passed in came *from* the worklist and so it has already been - // processed. - for (DomTreeNode *InnerN : DomNodes) - for (BasicBlock *SuccBB : successors(InnerN->getBlock())) - if (!DomSet.count(SuccBB)) - Worklist.insert(SuccBB); - - DomNodes.clear(); - DomSet.clear(); - }; // Append the initial dom frontier nodes. - AppendDomFrontier(UnswitchedNode); + appendDomFrontier(UnswitchedNode, Worklist, DomNodes, DomSet); // Walk the worklist. We grow the list in the loop and so must recompute size. for (int i = 0; i < (int)Worklist.size(); ++i) { @@ -197,7 +239,7 @@ static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH, DT.changeImmediateDominator(Node, OldPHNode); // Now add this node's dominator frontier to the worklist as well. 
- AppendDomFrontier(Node); + appendDomFrontier(Node, Worklist, DomNodes, DomSet); } } @@ -395,7 +437,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, // one of the predecessors for the loop exit block and may need to update its // idom. if (UnswitchedBB != LoopExitBB) - updateLoopExitIDom(LoopExitBB, L, DT); + updateIDomWithKnownCommonDominator(LoopExitBB, L.getHeader(), DT); // Since this is an i1 condition we can also trivially replace uses of it // within the loop with a constant. @@ -540,7 +582,7 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI); rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB, *ParentBB, *OldPH); - updateLoopExitIDom(DefaultExitBB, L, DT); + updateIDomWithKnownCommonDominator(DefaultExitBB, L.getHeader(), DT); DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB; } } @@ -567,7 +609,7 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI); rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB, *ParentBB, *OldPH); - updateLoopExitIDom(ExitBB, L, DT); + updateIDomWithKnownCommonDominator(ExitBB, L.getHeader(), DT); } // Update the case pair to point to the split block. CasePair.second = SplitExitBB; @@ -708,15 +750,1172 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT, return Changed; } +/// Build the cloned blocks for an unswitched copy of the given loop. +/// +/// The cloned blocks are inserted before the loop preheader (`LoopPH`) and +/// after the split block (`SplitBB`) that will be used to select between the +/// cloned and original loop. +/// +/// This routine handles cloning all of the necessary loop blocks and exit +/// blocks including rewriting their instructions and the relevant PHI nodes. +/// It skips loop and exit blocks that are not necessary based on the provided +/// set. It also correctly creates the unconditional branch in the cloned +/// unswitched parent block to only point at the unswitched successor. +/// +/// This does not handle most of the necessary updates to `LoopInfo`. Only exit +/// block splitting is correctly reflected in `LoopInfo`, essentially all of +/// the cloned blocks (and their loops) are left without full `LoopInfo` +/// updates. This also doesn't fully update `DominatorTree`. It adds the cloned +/// blocks to them but doesn't create the cloned `DominatorTree` structure and +/// instead the caller must recompute an accurate DT. It *does* correctly +/// update the `AssumptionCache` provided in `AC`. +static BasicBlock *buildClonedLoopBlocks( + Loop &L, BasicBlock *LoopPH, BasicBlock *SplitBB, + ArrayRef<BasicBlock *> ExitBlocks, BasicBlock *ParentBB, + BasicBlock *UnswitchedSuccBB, BasicBlock *ContinueSuccBB, + const SmallPtrSetImpl<BasicBlock *> &SkippedLoopAndExitBlocks, + ValueToValueMapTy &VMap, AssumptionCache &AC, DominatorTree &DT, + LoopInfo &LI) { + SmallVector<BasicBlock *, 4> NewBlocks; + NewBlocks.reserve(L.getNumBlocks() + ExitBlocks.size()); + + // We will need to clone a bunch of blocks, wrap up the clone operation in + // a helper. + auto CloneBlock = [&](BasicBlock *OldBB) { + // Clone the basic block and insert it before the new preheader. + BasicBlock *NewBB = CloneBasicBlock(OldBB, VMap, ".us", OldBB->getParent()); + NewBB->moveBefore(LoopPH); + + // Record this block and the mapping. 
+    NewBlocks.push_back(NewBB);
+    VMap[OldBB] = NewBB;
+
+    // Add the block to the domtree. We'll move it to the correct position
+    // below.
+    DT.addNewBlock(NewBB, SplitBB);
+
+    return NewBB;
+  };
+
+  // First, clone the preheader.
+  auto *ClonedPH = CloneBlock(LoopPH);
+
+  // Then clone all the loop blocks, skipping the ones that aren't necessary.
+  for (auto *LoopBB : L.blocks())
+    if (!SkippedLoopAndExitBlocks.count(LoopBB))
+      CloneBlock(LoopBB);
+
+  // Split all the loop exit edges so that when we clone the exit blocks, if
+  // any of the exit blocks are *also* a preheader for some other loop, we
+  // don't create multiple predecessors entering the loop header.
+  for (auto *ExitBB : ExitBlocks) {
+    if (SkippedLoopAndExitBlocks.count(ExitBB))
+      continue;
+
+    // When we are going to clone an exit, we don't need to clone all the
+    // instructions in the exit block and we want to ensure we have an easy
+    // place to merge the CFG, so split the exit first. This is always safe to
+    // do because there cannot be any non-loop predecessors of a loop exit in
+    // loop simplified form.
+    auto *MergeBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
+
+    // Rearrange the names to make it easier to write test cases by having the
+    // exit block carry the suffix rather than the merge block carrying the
+    // suffix.
+    MergeBB->takeName(ExitBB);
+    ExitBB->setName(Twine(MergeBB->getName()) + ".split");
+
+    // Now clone the original exit block.
+    auto *ClonedExitBB = CloneBlock(ExitBB);
+    assert(ClonedExitBB->getTerminator()->getNumSuccessors() == 1 &&
+           "Exit block should have been split to have one successor!");
+    assert(ClonedExitBB->getTerminator()->getSuccessor(0) == MergeBB &&
+           "Cloned exit block has the wrong successor!");
+
+    // Move the merge block's idom to be the split point as one exit is
+    // dominated by one header, and the other by another, so we know the split
+    // point dominates both. While the dominator tree isn't fully accurate, we
+    // want sub-trees within the original loop to correctly reflect
+    // dominance within that original loop (at least), and that requires moving
+    // the merge block out of that subtree.
+    // FIXME: This is very brittle as we essentially have a partial contract on
+    // the dominator tree. We really need to instead update it and keep it
+    // valid or stop relying on it.
+    DT.changeImmediateDominator(MergeBB, SplitBB);
+
+    // Remap any cloned instructions and create a merge phi node for them.
+    for (auto ZippedInsts : llvm::zip_first(
+             llvm::make_range(ExitBB->begin(), std::prev(ExitBB->end())),
+             llvm::make_range(ClonedExitBB->begin(),
+                              std::prev(ClonedExitBB->end())))) {
+      Instruction &I = std::get<0>(ZippedInsts);
+      Instruction &ClonedI = std::get<1>(ZippedInsts);
+
+      // The only instructions in the exit block should be PHI nodes and
+      // potentially a landing pad.
+      assert(
+          (isa<PHINode>(I) || isa<LandingPadInst>(I) || isa<CatchPadInst>(I)) &&
+          "Bad instruction in exit block!");
+      // We should have a value map between the instruction and its clone.
+      assert(VMap.lookup(&I) == &ClonedI && "Mismatch in the value map!");
+
+      auto *MergePN =
+          PHINode::Create(I.getType(), /*NumReservedValues*/ 2, ".us-phi",
+                          &*MergeBB->getFirstInsertionPt());
+      I.replaceAllUsesWith(MergePN);
+      MergePN->addIncoming(&I, ExitBB);
+      MergePN->addIncoming(&ClonedI, ClonedExitBB);
+    }
+  }
+
+  // Rewrite the instructions in the cloned blocks to refer to their cloned
+  // counterparts rather than the originals. We have to do this as a second
+  // pass so that we have everything available. Also, we have inserted new
+  // instructions which may
+  // include assume intrinsics, so we update the assumption cache while
+  // processing this.
+  for (auto *ClonedBB : NewBlocks)
+    for (Instruction &I : *ClonedBB) {
+      RemapInstruction(&I, VMap,
+                       RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+      if (auto *II = dyn_cast<IntrinsicInst>(&I))
+        if (II->getIntrinsicID() == Intrinsic::assume)
+          AC.registerAssumption(II);
+    }
+
+  // Remove the cloned parent as a predecessor of the cloned continue successor
+  // if we did in fact clone it.
+  auto *ClonedParentBB = cast<BasicBlock>(VMap.lookup(ParentBB));
+  if (auto *ClonedContinueSuccBB =
+          cast_or_null<BasicBlock>(VMap.lookup(ContinueSuccBB)))
+    ClonedContinueSuccBB->removePredecessor(ClonedParentBB,
+                                            /*DontDeleteUselessPHIs*/ true);
+  // Replace the cloned branch with an unconditional branch to the cloned
+  // unswitched successor.
+  auto *ClonedSuccBB = cast<BasicBlock>(VMap.lookup(UnswitchedSuccBB));
+  ClonedParentBB->getTerminator()->eraseFromParent();
+  BranchInst::Create(ClonedSuccBB, ClonedParentBB);
+
+  // Update any PHI nodes in the cloned successors of the skipped blocks to not
+  // have spurious incoming values.
+  for (auto *LoopBB : L.blocks())
+    if (SkippedLoopAndExitBlocks.count(LoopBB))
+      for (auto *SuccBB : successors(LoopBB))
+        if (auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB)))
+          for (PHINode &PN : ClonedSuccBB->phis())
+            PN.removeIncomingValue(LoopBB, /*DeletePHIIfEmpty*/ false);
+
+  return ClonedPH;
+}
+
+/// Recursively clone the specified loop and all of its children.
+///
+/// The target parent loop for the clone should be provided, or can be null if
+/// the clone is a top-level loop. While cloning, all the blocks are mapped
+/// with the provided value map. The entire original loop must be present in
+/// the value map. The cloned loop is returned.
+static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL,
+                           const ValueToValueMapTy &VMap, LoopInfo &LI) {
+  auto AddClonedBlocksToLoop = [&](Loop &OrigL, Loop &ClonedL) {
+    assert(ClonedL.getBlocks().empty() && "Must start with an empty loop!");
+    ClonedL.reserveBlocks(OrigL.getNumBlocks());
+    for (auto *BB : OrigL.blocks()) {
+      auto *ClonedBB = cast<BasicBlock>(VMap.lookup(BB));
+      ClonedL.addBlockEntry(ClonedBB);
+      if (LI.getLoopFor(BB) == &OrigL) {
+        assert(!LI.getLoopFor(ClonedBB) &&
+               "Should not have an existing loop for this block!");
+        LI.changeLoopFor(ClonedBB, &ClonedL);
+      }
+    }
+  };
+
+  // We specially handle the first loop because it may get cloned into
+  // a different parent and because we most commonly are cloning leaf loops.
+  Loop *ClonedRootL = LI.AllocateLoop();
+  if (RootParentL)
+    RootParentL->addChildLoop(ClonedRootL);
+  else
+    LI.addTopLevelLoop(ClonedRootL);
+  AddClonedBlocksToLoop(OrigRootL, *ClonedRootL);
+
+  if (OrigRootL.empty())
+    return ClonedRootL;
+
+  // If we have a nest, we can quickly clone the entire loop nest using an
+  // iterative approach because it is a tree. We keep the cloned parent in the
+  // data structure to avoid repeatedly querying through a map to find it.
+  SmallVector<std::pair<Loop *, Loop *>, 16> LoopsToClone;
+  // Build up the loops to clone in reverse order as we'll clone them from the
+  // back.
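+  // (Illustration with a hypothetical nest Root{A{B}, C}: pushing the
+  //  children reversed leaves [C, A] on the stack, so pop_back_val
+  //  visits A, then A's child B, then C, i.e. a pre-order walk of the
+  //  original nest.)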
+ for (Loop *ChildL : llvm::reverse(OrigRootL)) + LoopsToClone.push_back({ClonedRootL, ChildL}); + do { + Loop *ClonedParentL, *L; + std::tie(ClonedParentL, L) = LoopsToClone.pop_back_val(); + Loop *ClonedL = LI.AllocateLoop(); + ClonedParentL->addChildLoop(ClonedL); + AddClonedBlocksToLoop(*L, *ClonedL); + for (Loop *ChildL : llvm::reverse(*L)) + LoopsToClone.push_back({ClonedL, ChildL}); + } while (!LoopsToClone.empty()); + + return ClonedRootL; +} + +/// Build the cloned loops of an original loop from unswitching. +/// +/// Because unswitching simplifies the CFG of the loop, this isn't a trivial +/// operation. We need to re-verify that there even is a loop (as the backedge +/// may not have been cloned), and even if there are remaining backedges the +/// backedge set may be different. However, we know that each child loop is +/// undisturbed, we only need to find where to place each child loop within +/// either any parent loop or within a cloned version of the original loop. +/// +/// Because child loops may end up cloned outside of any cloned version of the +/// original loop, multiple cloned sibling loops may be created. All of them +/// are returned so that the newly introduced loop nest roots can be +/// identified. +static Loop *buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks, + const ValueToValueMapTy &VMap, LoopInfo &LI, + SmallVectorImpl<Loop *> &NonChildClonedLoops) { + Loop *ClonedL = nullptr; + + auto *OrigPH = OrigL.getLoopPreheader(); + auto *OrigHeader = OrigL.getHeader(); + + auto *ClonedPH = cast<BasicBlock>(VMap.lookup(OrigPH)); + auto *ClonedHeader = cast<BasicBlock>(VMap.lookup(OrigHeader)); + + // We need to know the loops of the cloned exit blocks to even compute the + // accurate parent loop. If we only clone exits to some parent of the + // original parent, we want to clone into that outer loop. We also keep track + // of the loops that our cloned exit blocks participate in. + Loop *ParentL = nullptr; + SmallVector<BasicBlock *, 4> ClonedExitsInLoops; + SmallDenseMap<BasicBlock *, Loop *, 16> ExitLoopMap; + ClonedExitsInLoops.reserve(ExitBlocks.size()); + for (auto *ExitBB : ExitBlocks) + if (auto *ClonedExitBB = cast_or_null<BasicBlock>(VMap.lookup(ExitBB))) + if (Loop *ExitL = LI.getLoopFor(ExitBB)) { + ExitLoopMap[ClonedExitBB] = ExitL; + ClonedExitsInLoops.push_back(ClonedExitBB); + if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL))) + ParentL = ExitL; + } + assert((!ParentL || ParentL == OrigL.getParentLoop() || + ParentL->contains(OrigL.getParentLoop())) && + "The computed parent loop should always contain (or be) the parent of " + "the original loop."); + + // We build the set of blocks dominated by the cloned header from the set of + // cloned blocks out of the original loop. While not all of these will + // necessarily be in the cloned loop, it is enough to establish that they + // aren't in unreachable cycles, etc. + SmallSetVector<BasicBlock *, 16> ClonedLoopBlocks; + for (auto *BB : OrigL.blocks()) + if (auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB))) + ClonedLoopBlocks.insert(ClonedBB); + + // Rebuild the set of blocks that will end up in the cloned loop. We may have + // skipped cloning some region of this loop which can in turn skip some of + // the backedges so we have to rebuild the blocks in the loop based on the + // backedges that remain after cloning. 
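The rebuild that follows is the classic backward walk that defines a natural loop: seed with the header's in-loop predecessors (the backedge sources) and chase predecessor edges until the set closes. A minimal standalone sketch of that walk (a hypothetical helper, not part of this commit; it assumes loop-simplified form, where the preheader is the only non-loop predecessor of the header):

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
using namespace llvm;

// Collect the blocks that still form a natural loop around Header by
// walking backwards from the remaining backedges. Returns an empty set
// when no backedge survives (the "loop" is no longer a loop).
static SmallPtrSet<BasicBlock *, 16>
blocksReachingBackedges(BasicBlock *Header, BasicBlock *Preheader) {
  SmallPtrSet<BasicBlock *, 16> Blocks;
  SmallVector<BasicBlock *, 16> Worklist;
  // Seed with the backedge sources: every header predecessor other than
  // the preheader.
  for (BasicBlock *Pred : predecessors(Header))
    if (Pred != Preheader && Blocks.insert(Pred).second)
      Worklist.push_back(Pred);
  if (Blocks.empty())
    return Blocks;
  // The header bounds the walk: once inserted, we never step past it.
  Blocks.insert(Header);
  while (!Worklist.empty()) {
    BasicBlock *BB = Worklist.pop_back_val();
    for (BasicBlock *Pred : predecessors(BB))
      if (Pred != Preheader && Blocks.insert(Pred).second)
        Worklist.push_back(Pred);
  }
  return Blocks;
}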
+ SmallVector<BasicBlock *, 16> Worklist; + SmallPtrSet<BasicBlock *, 16> BlocksInClonedLoop; + for (auto *Pred : predecessors(ClonedHeader)) { + // The only possible non-loop header predecessor is the preheader because + // we know we cloned the loop in simplified form. + if (Pred == ClonedPH) + continue; + + // Because the loop was in simplified form, the only non-loop predecessor + // should be the preheader. + assert(ClonedLoopBlocks.count(Pred) && "Found a predecessor of the loop " + "header other than the preheader " + "that is not part of the loop!"); + + // Insert this block into the loop set and on the first visit (and if it + // isn't the header we're currently walking) put it into the worklist to + // recurse through. + if (BlocksInClonedLoop.insert(Pred).second && Pred != ClonedHeader) + Worklist.push_back(Pred); + } + + // If we had any backedges then there *is* a cloned loop. Put the header into + // the loop set and then walk the worklist backwards to find all the blocks + // that remain within the loop after cloning. + if (!BlocksInClonedLoop.empty()) { + BlocksInClonedLoop.insert(ClonedHeader); + + while (!Worklist.empty()) { + BasicBlock *BB = Worklist.pop_back_val(); + assert(BlocksInClonedLoop.count(BB) && + "Didn't put block into the loop set!"); + + // Insert any predecessors that are in the possible set into the cloned + // set, and if the insert is successful, add them to the worklist. Note + // that we filter on the blocks that are definitely reachable via the + // backedge to the loop header so we may prune out dead code within the + // cloned loop. + for (auto *Pred : predecessors(BB)) + if (ClonedLoopBlocks.count(Pred) && + BlocksInClonedLoop.insert(Pred).second) + Worklist.push_back(Pred); + } + + ClonedL = LI.AllocateLoop(); + if (ParentL) { + ParentL->addBasicBlockToLoop(ClonedPH, LI); + ParentL->addChildLoop(ClonedL); + } else { + LI.addTopLevelLoop(ClonedL); + } + + ClonedL->reserveBlocks(BlocksInClonedLoop.size()); + // We don't want to just add the cloned loop blocks based on how we + // discovered them. The original order of blocks was carefully built in + // a way that doesn't rely on predecessor ordering. Rather than re-invent + // that logic, we just re-walk the original blocks (and those of the child + // loops) and filter them as we add them into the cloned loop. + for (auto *BB : OrigL.blocks()) { + auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB)); + if (!ClonedBB || !BlocksInClonedLoop.count(ClonedBB)) + continue; + + // Directly add the blocks that are only in this loop. + if (LI.getLoopFor(BB) == &OrigL) { + ClonedL->addBasicBlockToLoop(ClonedBB, LI); + continue; + } + + // We want to manually add it to this loop and parents. + // Registering it with LoopInfo will happen when we clone the top + // loop for this block. + for (Loop *PL = ClonedL; PL; PL = PL->getParentLoop()) + PL->addBlockEntry(ClonedBB); + } + + // Now add each child loop whose header remains within the cloned loop. All + // of the blocks within the loop must satisfy the same constraints as the + // header so once we pass the header checks we can just clone the entire + // child loop nest. + for (Loop *ChildL : OrigL) { + auto *ClonedChildHeader = + cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader())); + if (!ClonedChildHeader || !BlocksInClonedLoop.count(ClonedChildHeader)) + continue; + +#ifndef NDEBUG + // We should never have a cloned child loop header but fail to have + // all of the blocks for that child loop. 
+      for (auto *ChildLoopBB : ChildL->blocks())
+        assert(BlocksInClonedLoop.count(
+                   cast<BasicBlock>(VMap.lookup(ChildLoopBB))) &&
+               "Child cloned loop has a header within the cloned outer "
+               "loop but not all of its blocks!");
+#endif
+
+      cloneLoopNest(*ChildL, ClonedL, VMap, LI);
+    }
+  }
+
+  // Now that we've handled all the components of the original loop that were
+  // cloned into a new loop, we still need to handle anything from the original
+  // loop that wasn't in a cloned loop.
+
+  // Figure out what blocks are left to place within any loop nest containing
+  // the unswitched loop. If we never formed a loop, the cloned PH is one of
+  // them.
+  SmallPtrSet<BasicBlock *, 16> UnloopedBlockSet;
+  if (BlocksInClonedLoop.empty())
+    UnloopedBlockSet.insert(ClonedPH);
+  for (auto *ClonedBB : ClonedLoopBlocks)
+    if (!BlocksInClonedLoop.count(ClonedBB))
+      UnloopedBlockSet.insert(ClonedBB);
+
+  // Copy the cloned exits and sort them in ascending loop depth; we'll work
+  // backwards across these to process them inside out. The order shouldn't
+  // matter as we're just trying to build up the map from inside-out; we use
+  // the map in a more stably ordered way below.
+  auto OrderedClonedExitsInLoops = ClonedExitsInLoops;
+  std::sort(OrderedClonedExitsInLoops.begin(), OrderedClonedExitsInLoops.end(),
+            [&](BasicBlock *LHS, BasicBlock *RHS) {
+              return ExitLoopMap.lookup(LHS)->getLoopDepth() <
+                     ExitLoopMap.lookup(RHS)->getLoopDepth();
+            });
+
+  // Populate the existing ExitLoopMap with everything reachable from each
+  // exit, starting from the innermost exit.
+  while (!UnloopedBlockSet.empty() && !OrderedClonedExitsInLoops.empty()) {
+    assert(Worklist.empty() && "Didn't clear worklist!");
+
+    BasicBlock *ExitBB = OrderedClonedExitsInLoops.pop_back_val();
+    Loop *ExitL = ExitLoopMap.lookup(ExitBB);
+
+    // Walk the CFG back until we hit the cloned PH adding everything reachable
+    // and in the unlooped set to this exit block's loop.
+    Worklist.push_back(ExitBB);
+    do {
+      BasicBlock *BB = Worklist.pop_back_val();
+      // We can stop recursing at the cloned preheader (if we get there).
+      if (BB == ClonedPH)
+        continue;
+
+      for (BasicBlock *PredBB : predecessors(BB)) {
+        // If this pred has already been moved to our set or is part of some
+        // (inner) loop, no update needed.
+        if (!UnloopedBlockSet.erase(PredBB)) {
+          assert(
+              (BlocksInClonedLoop.count(PredBB) || ExitLoopMap.count(PredBB)) &&
+              "Predecessor not mapped to a loop!");
+          continue;
+        }
+
+        // We just insert into the loop set here. We'll add these blocks to the
+        // exit loop after we build up the set in an order that doesn't rely on
+        // predecessor order (which in turn relies on use list order).
+        bool Inserted = ExitLoopMap.insert({PredBB, ExitL}).second;
+        (void)Inserted;
+        assert(Inserted && "Should only visit an unlooped block once!");
+
+        // And recurse through to its predecessors.
+        Worklist.push_back(PredBB);
+      }
+    } while (!Worklist.empty());
+  }
+
+  // Now that the ExitLoopMap gives us a mapping for all the non-looping cloned
+  // blocks to their outer loops, walk the cloned blocks and the cloned exits
+  // in their original order adding them to the correct loop.
+
+  // We need a stable insertion order. We use the order of the original loop
+  // order and map into the correct parent loop.
+  for (auto *BB : llvm::concat<BasicBlock *const>(
+           makeArrayRef(ClonedPH), ClonedLoopBlocks, ClonedExitsInLoops))
+    if (Loop *OuterL = ExitLoopMap.lookup(BB))
+      OuterL->addBasicBlockToLoop(BB, LI);
+
+#ifndef NDEBUG
+  for (auto &BBAndL : ExitLoopMap) {
+    auto *BB = BBAndL.first;
+    auto *OuterL = BBAndL.second;
+    assert(LI.getLoopFor(BB) == OuterL &&
+           "Failed to put all blocks into outer loops!");
+  }
+#endif
+
+  // Now that all the blocks are placed into the correct containing loop in the
+  // absence of child loops, find all the potentially cloned child loops and
+  // clone them into whatever outer loop we placed their header into.
+  for (Loop *ChildL : OrigL) {
+    auto *ClonedChildHeader =
+        cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader()));
+    if (!ClonedChildHeader || BlocksInClonedLoop.count(ClonedChildHeader))
+      continue;
+
+#ifndef NDEBUG
+    for (auto *ChildLoopBB : ChildL->blocks())
+      assert(VMap.count(ChildLoopBB) &&
+             "Cloned a child loop header but not all of that loop's blocks!");
+#endif
+
+    NonChildClonedLoops.push_back(cloneLoopNest(
+        *ChildL, ExitLoopMap.lookup(ClonedChildHeader), VMap, LI));
+  }
+
+  // Return the main cloned loop if any.
+  return ClonedL;
+}
+
+static void deleteDeadBlocksFromLoop(Loop &L, BasicBlock *DeadSubtreeRoot,
+                                     SmallVectorImpl<BasicBlock *> &ExitBlocks,
+                                     DominatorTree &DT, LoopInfo &LI) {
+  // Walk the dominator tree to build up the set of blocks we will delete here.
+  // The order is designed to allow us to always delete bottom-up and avoid any
+  // dangling uses.
+  SmallSetVector<BasicBlock *, 16> DeadBlocks;
+  DeadBlocks.insert(DeadSubtreeRoot);
+  for (int i = 0; i < (int)DeadBlocks.size(); ++i)
+    for (DomTreeNode *ChildN : *DT[DeadBlocks[i]]) {
+      // FIXME: This assert should pass and that means we don't change nearly
+      // as much below! Consider rewriting all of this to avoid deleting
+      // blocks. They are always cloned before being deleted, and so instead
+      // could just be moved.
+      // FIXME: This in turn means that we might actually be more able to
+      // update the domtree.
+      assert((L.contains(ChildN->getBlock()) ||
+              llvm::find(ExitBlocks, ChildN->getBlock()) != ExitBlocks.end()) &&
+             "Should never reach beyond the loop and exits when deleting!");
+      DeadBlocks.insert(ChildN->getBlock());
+    }
+
+  // Filter out the dead blocks from the exit blocks list so that it can be
+  // used in the caller.
+  llvm::erase_if(ExitBlocks,
+                 [&](BasicBlock *BB) { return DeadBlocks.count(BB); });
+
+  // Remove these blocks from their successors.
+  for (auto *BB : DeadBlocks)
+    for (BasicBlock *SuccBB : successors(BB))
+      SuccBB->removePredecessor(BB, /*DontDeleteUselessPHIs*/ true);
+
+  // Walk from this loop up through its parents removing all of the dead blocks.
+  for (Loop *ParentL = &L; ParentL; ParentL = ParentL->getParentLoop()) {
+    for (auto *BB : DeadBlocks)
+      ParentL->getBlocksSet().erase(BB);
+    llvm::erase_if(ParentL->getBlocksVector(),
+                   [&](BasicBlock *BB) { return DeadBlocks.count(BB); });
+  }
+
+  // Now delete the dead child loops. This raw delete will clear them
+  // recursively.
+  llvm::erase_if(L.getSubLoopsVector(), [&](Loop *ChildL) {
+    if (!DeadBlocks.count(ChildL->getHeader()))
+      return false;
+
+    assert(llvm::all_of(ChildL->blocks(),
+                        [&](BasicBlock *ChildBB) {
+                          return DeadBlocks.count(ChildBB);
+                        }) &&
+           "If the child loop header is dead all blocks in the child loop must "
+           "be dead as well!");
+    LI.destroy(ChildL);
+    return true;
+  });
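The llvm::erase_if calls above are LLVM's wrapper around the erase/remove idiom; a minimal usage sketch (an editorial illustration, not from this commit):

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

int main() {
  llvm::SmallVector<int, 8> V{1, 2, 3, 4, 5};
  // Remove matching elements in place, preserving the relative order of
  // the elements that remain.
  llvm::erase_if(V, [](int X) { return X % 2 == 0; });
  // V now holds {1, 3, 5}.
}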
+
+  // Remove the mappings for the dead blocks.
+  for (auto *BB : DeadBlocks)
+    LI.changeLoopFor(BB, nullptr);
+
+  // Drop all the references from these blocks to others to handle cyclic
+  // references as we start deleting the blocks themselves.
+  for (auto *BB : DeadBlocks)
+    BB->dropAllReferences();
+
+  for (auto *BB : llvm::reverse(DeadBlocks)) {
+    DT.eraseNode(BB);
+    BB->eraseFromParent();
+  }
+}
+
+/// Recompute the set of blocks in a loop after unswitching.
+///
+/// This walks from the original header's predecessors to rebuild the loop. We
+/// take advantage of the fact that new blocks can't have been added, and so we
+/// filter by the original loop's blocks. This also handles potentially
+/// unreachable code that we don't want to explore but might be found examining
+/// the predecessors of the header.
+///
+/// If the original loop is no longer a loop, this will return an empty set. If
+/// it remains a loop, all the blocks within it will be added to the set
+/// (including those blocks in inner loops).
+static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L,
+                                                                 LoopInfo &LI) {
+  SmallPtrSet<const BasicBlock *, 16> LoopBlockSet;
+
+  auto *PH = L.getLoopPreheader();
+  auto *Header = L.getHeader();
+
+  // A worklist to use while walking backwards from the header.
+  SmallVector<BasicBlock *, 16> Worklist;
+
+  // First walk the predecessors of the header to find the backedges. This will
+  // form the basis of our walk.
+  for (auto *Pred : predecessors(Header)) {
+    // Skip the preheader.
+    if (Pred == PH)
+      continue;
+
+    // Because the loop was in simplified form, the only non-loop predecessor
+    // is the preheader.
+    assert(L.contains(Pred) && "Found a predecessor of the loop header other "
+                               "than the preheader that is not part of the "
+                               "loop!");
+
+    // Insert this block into the loop set and, on the first visit, if it
+    // isn't the header we're currently walking, put it into the worklist to
+    // recurse through.
+    if (LoopBlockSet.insert(Pred).second && Pred != Header)
+      Worklist.push_back(Pred);
+  }
+
+  // If no backedges were found, we're done.
+  if (LoopBlockSet.empty())
+    return LoopBlockSet;
+
+  // Add the loop header to the set.
+  LoopBlockSet.insert(Header);
+
+  // We found backedges; recurse through them to identify the loop blocks.
+  while (!Worklist.empty()) {
+    BasicBlock *BB = Worklist.pop_back_val();
+    assert(LoopBlockSet.count(BB) && "Didn't put block into the loop set!");
+
+    // Because we know the inner loop structure remains valid we can use the
+    // loop structure to jump immediately across the entire nested loop.
+    // Further, because it is in loop simplified form, we can directly jump
+    // to its preheader afterward.
+    if (Loop *InnerL = LI.getLoopFor(BB))
+      if (InnerL != &L) {
+        assert(L.contains(InnerL) &&
+               "Should not reach a loop *outside* this loop!");
+        // The preheader is the only possible predecessor of the loop so
+        // insert it into the set and check whether it was already handled.
+        auto *InnerPH = InnerL->getLoopPreheader();
+        assert(L.contains(InnerPH) && "Cannot contain an inner loop block "
+                                      "but not contain the inner loop "
+                                      "preheader!");
+        if (!LoopBlockSet.insert(InnerPH).second)
+          // The only way to reach the preheader is through the loop body
+          // itself so if it has been visited the loop is already handled.
+          continue;
+
+        // Insert all of the blocks (other than those already present) into
+        // the loop set.
The only block we expect to already be in the set is + // the one we used to find this loop as we immediately handle the + // others the first time we encounter the loop. + for (auto *InnerBB : InnerL->blocks()) { + if (InnerBB == BB) { + assert(LoopBlockSet.count(InnerBB) && + "Block should already be in the set!"); + continue; + } + + bool Inserted = LoopBlockSet.insert(InnerBB).second; + (void)Inserted; + assert(Inserted && "Should only insert an inner loop once!"); + } + + // Add the preheader to the worklist so we will continue past the + // loop body. + Worklist.push_back(InnerPH); + continue; + } + + // Insert any predecessors that were in the original loop into the new + // set, and if the insert is successful, add them to the worklist. + for (auto *Pred : predecessors(BB)) + if (L.contains(Pred) && LoopBlockSet.insert(Pred).second) + Worklist.push_back(Pred); + } + + // We've found all the blocks participating in the loop, return our completed + // set. + return LoopBlockSet; +} + +/// Rebuild a loop after unswitching removes some subset of blocks and edges. +/// +/// The removal may have removed some child loops entirely but cannot have +/// disturbed any remaining child loops. However, they may need to be hoisted +/// to the parent loop (or to be top-level loops). The original loop may be +/// completely removed. +/// +/// The sibling loops resulting from this update are returned. If the original +/// loop remains a valid loop, it will be the first entry in this list with all +/// of the newly sibling loops following it. +/// +/// Returns true if the loop remains a loop after unswitching, and false if it +/// is no longer a loop after unswitching (and should not continue to be +/// referenced). +static bool rebuildLoopAfterUnswitch(Loop &L, ArrayRef<BasicBlock *> ExitBlocks, + LoopInfo &LI, + SmallVectorImpl<Loop *> &HoistedLoops) { + auto *PH = L.getLoopPreheader(); + + // Compute the actual parent loop from the exit blocks. Because we may have + // pruned some exits the loop may be different from the original parent. + Loop *ParentL = nullptr; + SmallVector<Loop *, 4> ExitLoops; + SmallVector<BasicBlock *, 4> ExitsInLoops; + ExitsInLoops.reserve(ExitBlocks.size()); + for (auto *ExitBB : ExitBlocks) + if (Loop *ExitL = LI.getLoopFor(ExitBB)) { + ExitLoops.push_back(ExitL); + ExitsInLoops.push_back(ExitBB); + if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL))) + ParentL = ExitL; + } + + // Recompute the blocks participating in this loop. This may be empty if it + // is no longer a loop. + auto LoopBlockSet = recomputeLoopBlockSet(L, LI); + + // If we still have a loop, we need to re-set the loop's parent as the exit + // block set changing may have moved it within the loop nest. Note that this + // can only happen when this loop has a parent as it can only hoist the loop + // *up* the nest. + if (!LoopBlockSet.empty() && L.getParentLoop() != ParentL) { + // Remove this loop's (original) blocks from all of the intervening loops. + for (Loop *IL = L.getParentLoop(); IL != ParentL; + IL = IL->getParentLoop()) { + IL->getBlocksSet().erase(PH); + for (auto *BB : L.blocks()) + IL->getBlocksSet().erase(BB); + llvm::erase_if(IL->getBlocksVector(), [&](BasicBlock *BB) { + return BB == PH || L.contains(BB); + }); + } + + LI.changeLoopFor(PH, ParentL); + L.getParentLoop()->removeChildLoop(&L); + if (ParentL) + ParentL->addChildLoop(&L); + else + LI.addTopLevelLoop(&L); + } + + // Now we update all the blocks which are no longer within the loop. 
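The update below relies on std::stable_partition to split the block vector while keeping the surviving blocks in their original relative order; a self-contained sketch of the idiom (illustrative only, not from this commit):

#include <algorithm>
#include <vector>

int main() {
  std::vector<int> Blocks = {1, 2, 3, 4, 5, 6};
  // Elements satisfying the predicate move to the front, the rest to the
  // back, and both groups keep their original relative order.
  auto SplitI = std::stable_partition(Blocks.begin(), Blocks.end(),
                                      [](int B) { return B % 2 == 0; });
  // Blocks == {2, 4, 6, 1, 3, 5}; [SplitI, end()) is the partitioned-out
  // tail, which can then be erased.
  Blocks.erase(SplitI, Blocks.end());
}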
+  auto &Blocks = L.getBlocksVector();
+  auto BlocksSplitI =
+      LoopBlockSet.empty()
+          ? Blocks.begin()
+          : std::stable_partition(
+                Blocks.begin(), Blocks.end(),
+                [&](BasicBlock *BB) { return LoopBlockSet.count(BB); });
+
+  // Before we erase the list of unlooped blocks, build a set of them.
+  SmallPtrSet<BasicBlock *, 16> UnloopedBlocks(BlocksSplitI, Blocks.end());
+  if (LoopBlockSet.empty())
+    UnloopedBlocks.insert(PH);
+
+  // Now erase these blocks from the loop.
+  for (auto *BB : make_range(BlocksSplitI, Blocks.end()))
+    L.getBlocksSet().erase(BB);
+  Blocks.erase(BlocksSplitI, Blocks.end());
+
+  // Sort the exits in ascending loop depth; we'll work backwards across these
+  // to process them inside out.
+  std::stable_sort(ExitsInLoops.begin(), ExitsInLoops.end(),
+                   [&](BasicBlock *LHS, BasicBlock *RHS) {
+                     return LI.getLoopDepth(LHS) < LI.getLoopDepth(RHS);
+                   });
+
+  // We'll build up a set for each exit loop.
+  SmallPtrSet<BasicBlock *, 16> NewExitLoopBlocks;
+  Loop *PrevExitL = L.getParentLoop(); // The deepest possible exit loop.
+
+  auto RemoveUnloopedBlocksFromLoop =
+      [](Loop &L, SmallPtrSetImpl<BasicBlock *> &UnloopedBlocks) {
+        for (auto *BB : UnloopedBlocks)
+          L.getBlocksSet().erase(BB);
+        llvm::erase_if(L.getBlocksVector(), [&](BasicBlock *BB) {
+          return UnloopedBlocks.count(BB);
+        });
+      };
+
+  SmallVector<BasicBlock *, 16> Worklist;
+  while (!UnloopedBlocks.empty() && !ExitsInLoops.empty()) {
+    assert(Worklist.empty() && "Didn't clear worklist!");
+    assert(NewExitLoopBlocks.empty() && "Didn't clear loop set!");
+
+    // Grab the next exit block, in decreasing loop depth order.
+    BasicBlock *ExitBB = ExitsInLoops.pop_back_val();
+    Loop &ExitL = *LI.getLoopFor(ExitBB);
+    assert(ExitL.contains(&L) && "Exit loop must contain the inner loop!");
+
+    // Erase all of the unlooped blocks from the loops between the previous
+    // exit loop and this exit loop. This works because the ExitsInLoops list
+    // is sorted in increasing order of loop depth and thus we visit loops in
+    // decreasing order of loop depth.
+    for (; PrevExitL != &ExitL; PrevExitL = PrevExitL->getParentLoop())
+      RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks);
+
+    // Walk the CFG back until we hit the preheader adding everything reachable
+    // and in the unlooped set to this exit block's loop.
+    Worklist.push_back(ExitBB);
+    do {
+      BasicBlock *BB = Worklist.pop_back_val();
+      // We can stop recursing at the preheader (if we get there).
+      if (BB == PH)
+        continue;
+
+      for (BasicBlock *PredBB : predecessors(BB)) {
+        // If this pred has already been moved to our set or is part of some
+        // (inner) loop, no update needed.
+        if (!UnloopedBlocks.erase(PredBB)) {
+          assert((NewExitLoopBlocks.count(PredBB) ||
+                  ExitL.contains(LI.getLoopFor(PredBB))) &&
+                 "Predecessor not in a nested loop (or already visited)!");
+          continue;
+        }
+
+        // We just insert into the loop set here. We'll add these blocks to the
+        // exit loop after we build up the set in a deterministic order rather
+        // than the predecessor-influenced visit order.
+        bool Inserted = NewExitLoopBlocks.insert(PredBB).second;
+        (void)Inserted;
+        assert(Inserted && "Should only visit an unlooped block once!");
+
+        // And recurse through to its predecessors.
+        Worklist.push_back(PredBB);
+      }
+    } while (!Worklist.empty());
+
+    // If blocks in this exit loop were directly part of the original loop (as
+    // opposed to a child loop) update the map to point to this exit loop. This
+    // just updates a map and so the fact that the order is unstable is fine.
+ for (auto *BB : NewExitLoopBlocks) + if (Loop *BBL = LI.getLoopFor(BB)) + if (BBL == &L || !L.contains(BBL)) + LI.changeLoopFor(BB, &ExitL); + + // We will remove the remaining unlooped blocks from this loop in the next + // iteration or below. + NewExitLoopBlocks.clear(); + } + + // Any remaining unlooped blocks are no longer part of any loop unless they + // are part of some child loop. + for (; PrevExitL; PrevExitL = PrevExitL->getParentLoop()) + RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks); + for (auto *BB : UnloopedBlocks) + if (Loop *BBL = LI.getLoopFor(BB)) + if (BBL == &L || !L.contains(BBL)) + LI.changeLoopFor(BB, nullptr); + + // Sink all the child loops whose headers are no longer in the loop set to + // the parent (or to be top level loops). We reach into the loop and directly + // update its subloop vector to make this batch update efficient. + auto &SubLoops = L.getSubLoopsVector(); + auto SubLoopsSplitI = + LoopBlockSet.empty() + ? SubLoops.begin() + : std::stable_partition( + SubLoops.begin(), SubLoops.end(), [&](Loop *SubL) { + return LoopBlockSet.count(SubL->getHeader()); + }); + for (auto *HoistedL : make_range(SubLoopsSplitI, SubLoops.end())) { + HoistedLoops.push_back(HoistedL); + HoistedL->setParentLoop(nullptr); + + // To compute the new parent of this hoisted loop we look at where we + // placed the preheader above. We can't lookup the header itself because we + // retained the mapping from the header to the hoisted loop. But the + // preheader and header should have the exact same new parent computed + // based on the set of exit blocks from the original loop as the preheader + // is a predecessor of the header and so reached in the reverse walk. And + // because the loops were all in simplified form the preheader of the + // hoisted loop can't be part of some *other* loop. + if (auto *NewParentL = LI.getLoopFor(HoistedL->getLoopPreheader())) + NewParentL->addChildLoop(HoistedL); + else + LI.addTopLevelLoop(HoistedL); + } + SubLoops.erase(SubLoopsSplitI, SubLoops.end()); + + // Actually delete the loop if nothing remained within it. + if (Blocks.empty()) { + assert(SubLoops.empty() && + "Failed to remove all subloops from the original loop!"); + if (Loop *ParentL = L.getParentLoop()) + ParentL->removeChildLoop(llvm::find(*ParentL, &L)); + else + LI.removeLoop(llvm::find(LI, &L)); + LI.destroy(&L); + return false; + } + + return true; +} + +/// Helper to visit a dominator subtree, invoking a callable on each node. +/// +/// Returning false at any point will stop walking past that node of the tree. +template <typename CallableT> +void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) { + SmallVector<DomTreeNode *, 4> DomWorklist; + DomWorklist.push_back(DT[BB]); +#ifndef NDEBUG + SmallPtrSet<DomTreeNode *, 4> Visited; + Visited.insert(DT[BB]); +#endif + do { + DomTreeNode *N = DomWorklist.pop_back_val(); + + // Visit this node. + if (!Callable(N->getBlock())) + continue; + + // Accumulate the child nodes. + for (DomTreeNode *ChildN : *N) { + assert(Visited.insert(ChildN).second && + "Cannot visit a node twice when walking a tree!"); + DomWorklist.push_back(ChildN); + } + } while (!DomWorklist.empty()); +} + +/// Take an invariant branch that has been determined to be safe and worthwhile +/// to unswitch despite being non-trivial to do so and perform the unswitch. +/// +/// This directly updates the CFG to hoist the predicate out of the loop, and +/// clone the necessary parts of the loop to maintain behavior. 
+/// +/// It also updates both dominator tree and loopinfo based on the unswitching. +/// +/// Once unswitching has been performed it runs the provided callback to report +/// the new loops and no-longer valid loops to the caller. +static bool unswitchInvariantBranch( + Loop &L, BranchInst &BI, DominatorTree &DT, LoopInfo &LI, + AssumptionCache &AC, + function_ref<void(bool, ArrayRef<Loop *>)> NonTrivialUnswitchCB) { + assert(BI.isConditional() && "Can only unswitch a conditional branch!"); + assert(L.isLoopInvariant(BI.getCondition()) && + "Can only unswitch an invariant branch condition!"); + + // Constant and BBs tracking the cloned and continuing successor. + const int ClonedSucc = 0; + auto *ParentBB = BI.getParent(); + auto *UnswitchedSuccBB = BI.getSuccessor(ClonedSucc); + auto *ContinueSuccBB = BI.getSuccessor(1 - ClonedSucc); + + assert(UnswitchedSuccBB != ContinueSuccBB && + "Should not unswitch a branch that always goes to the same place!"); + + // The branch should be in this exact loop. Any inner loop's invariant branch + // should be handled by unswitching that inner loop. The caller of this + // routine should filter out any candidates that remain (but were skipped for + // whatever reason). + assert(LI.getLoopFor(ParentBB) == &L && "Branch in an inner loop!"); + + SmallVector<BasicBlock *, 4> ExitBlocks; + L.getUniqueExitBlocks(ExitBlocks); + + // We cannot unswitch if exit blocks contain a cleanuppad instruction as we + // don't know how to split those exit blocks. + // FIXME: We should teach SplitBlock to handle this and remove this + // restriction. + for (auto *ExitBB : ExitBlocks) + if (isa<CleanupPadInst>(ExitBB->getFirstNonPHI())) + return false; + + SmallPtrSet<BasicBlock *, 4> ExitBlockSet(ExitBlocks.begin(), + ExitBlocks.end()); + + // Compute the parent loop now before we start hacking on things. + Loop *ParentL = L.getParentLoop(); + + // Compute the outer-most loop containing one of our exit blocks. This is the + // furthest up our loopnest which can be mutated, which we will use below to + // update things. + Loop *OuterExitL = &L; + for (auto *ExitBB : ExitBlocks) { + Loop *NewOuterExitL = LI.getLoopFor(ExitBB); + if (!NewOuterExitL) { + // We exited the entire nest with this block, so we're done. + OuterExitL = nullptr; + break; + } + if (NewOuterExitL != OuterExitL && NewOuterExitL->contains(OuterExitL)) + OuterExitL = NewOuterExitL; + } + + // If the edge we *aren't* cloning in the unswitch (the continuing edge) + // dominates its target, we can skip cloning the dominated region of the loop + // and its exits. We compute this as a set of nodes to be skipped. + SmallPtrSet<BasicBlock *, 4> SkippedLoopAndExitBlocks; + if (ContinueSuccBB->getUniquePredecessor() || + llvm::all_of(predecessors(ContinueSuccBB), [&](BasicBlock *PredBB) { + return PredBB == ParentBB || DT.dominates(ContinueSuccBB, PredBB); + })) { + visitDomSubTree(DT, ContinueSuccBB, [&](BasicBlock *BB) { + SkippedLoopAndExitBlocks.insert(BB); + return true; + }); + } + // Similarly, if the edge we *are* cloning in the unswitch (the unswitched + // edge) dominates its target, we will end up with dead nodes in the original + // loop and its exits that will need to be deleted. Here, we just retain that + // the property holds and will compute the deleted set later. 
+ bool DeleteUnswitchedSucc = + UnswitchedSuccBB->getUniquePredecessor() || + llvm::all_of(predecessors(UnswitchedSuccBB), [&](BasicBlock *PredBB) { + return PredBB == ParentBB || DT.dominates(UnswitchedSuccBB, PredBB); + }); + + // Split the preheader, so that we know that there is a safe place to insert + // the conditional branch. We will change the preheader to have a conditional + // branch on LoopCond. The original preheader will become the split point + // between the unswitched versions, and we will have a new preheader for the + // original loop. + BasicBlock *SplitBB = L.getLoopPreheader(); + BasicBlock *LoopPH = SplitEdge(SplitBB, L.getHeader(), &DT, &LI); + + // Keep a mapping for the cloned values. + ValueToValueMapTy VMap; + + // Build the cloned blocks from the loop. + auto *ClonedPH = buildClonedLoopBlocks( + L, LoopPH, SplitBB, ExitBlocks, ParentBB, UnswitchedSuccBB, + ContinueSuccBB, SkippedLoopAndExitBlocks, VMap, AC, DT, LI); + + // Build the cloned loop structure itself. This may be substantially + // different from the original structure due to the simplified CFG. This also + // handles inserting all the cloned blocks into the correct loops. + SmallVector<Loop *, 4> NonChildClonedLoops; + Loop *ClonedL = + buildClonedLoops(L, ExitBlocks, VMap, LI, NonChildClonedLoops); + + // Remove the parent as a predecessor of the unswitched successor. + UnswitchedSuccBB->removePredecessor(ParentBB, /*DontDeleteUselessPHIs*/ true); + + // Now splice the branch from the original loop and use it to select between + // the two loops. + SplitBB->getTerminator()->eraseFromParent(); + SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), BI); + BI.setSuccessor(ClonedSucc, ClonedPH); + BI.setSuccessor(1 - ClonedSucc, LoopPH); + + // Create a new unconditional branch to the continuing block (as opposed to + // the one cloned). + BranchInst::Create(ContinueSuccBB, ParentBB); + + // Delete anything that was made dead in the original loop due to + // unswitching. + if (DeleteUnswitchedSucc) + deleteDeadBlocksFromLoop(L, UnswitchedSuccBB, ExitBlocks, DT, LI); + + SmallVector<Loop *, 4> HoistedLoops; + bool IsStillLoop = rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops); + + // This will have completely invalidated the dominator tree. We can't easily + // bound how much is invalid because in some cases we will refine the + // predecessor set of exit blocks of the loop which can move large unrelated + // regions of code into a new subtree. + // + // FIXME: Eventually, we should use an incremental update utility that + // leverages the existing information in the dominator tree (and potentially + // the nature of the change) to more efficiently update things. + DT.recalculate(*SplitBB->getParent()); + + // We can change which blocks are exit blocks of all the cloned sibling + // loops, the current loop, and any parent loops which shared exit blocks + // with the current loop. As a consequence, we need to re-form LCSSA for + // them. But we shouldn't need to re-form LCSSA for any child loops. + // FIXME: This could be made more efficient by tracking which exit blocks are + // new, and focusing on them, but that isn't likely to be necessary. + // + // In order to reasonably rebuild LCSSA we need to walk inside-out across the + // loop nest and update every loop that could have had its exits changed. We + // also need to cover any intervening loops. We add all of these loops to + // a list and sort them by loop depth to achieve this without updating + // unnecessary loops. 
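For context on what re-forming LCSSA entails: any value defined inside a loop and used outside it must reach that use through a PHI node in an exit block, so a use of a loop-defined %inc after the loop becomes a use of something like %inc.lcssa = phi i32 [ %inc, %exiting ] placed in the exit. When unswitching changes which blocks are exits, those PHIs must be re-inserted along the new exit edges, which is what the calls to formLCSSA below accomplish.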
+  auto UpdateLCSSA = [&](Loop &UpdateL) {
+#ifndef NDEBUG
+    for (Loop *ChildL : UpdateL)
+      assert(ChildL->isRecursivelyLCSSAForm(DT, LI) &&
+             "Perturbed a child loop's LCSSA form!");
+#endif
+    formLCSSA(UpdateL, DT, &LI, nullptr);
+  };
+
+  // For non-child cloned loops and hoisted loops, we just need to update LCSSA
+  // and we can do it in any order as they don't nest relative to each other.
+  for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops))
+    UpdateLCSSA(*UpdatedL);
+
+  // If the original loop had exit blocks, walk up through the outermost loop
+  // of those exit blocks to update LCSSA and form updated dedicated exits.
+  if (OuterExitL != &L) {
+    SmallVector<Loop *, 4> OuterLoops;
+    // We start with the cloned loop and the current loop if they are loops and
+    // move toward OuterExitL. Also, if either the cloned loop or the current
+    // loop has become a top-level loop we need to walk all the way out.
+    if (ClonedL) {
+      OuterLoops.push_back(ClonedL);
+      if (!ClonedL->getParentLoop())
+        OuterExitL = nullptr;
+    }
+    if (IsStillLoop) {
+      OuterLoops.push_back(&L);
+      if (!L.getParentLoop())
+        OuterExitL = nullptr;
+    }
+    // Grab all of the enclosing loops now.
+    for (Loop *OuterL = ParentL; OuterL != OuterExitL;
+         OuterL = OuterL->getParentLoop())
+      OuterLoops.push_back(OuterL);
+
+    // Finally, update our list of outer loops. This is nicely ordered to work
+    // inside-out.
+    for (Loop *OuterL : OuterLoops) {
+      // First build LCSSA for this loop so that we can preserve it when
+      // forming dedicated exits. We don't want to perturb some other loop's
+      // LCSSA while doing that CFG edit.
+      UpdateLCSSA(*OuterL);
+
+      // For loops reached by this loop's original exit blocks we may have
+      // introduced new, non-dedicated exits. At least try to re-form dedicated
+      // exits for these loops. This may fail if they couldn't have dedicated
+      // exits to start with.
+      formDedicatedExitBlocks(OuterL, &DT, &LI, /*PreserveLCSSA*/ true);
+    }
+  }
+
+#ifndef NDEBUG
+  // Verify the entire loop structure to catch any incorrect updates before we
+  // progress in the pass pipeline.
+  LI.verify(DT);
+#endif
+
+  // Now that we've unswitched something, make callbacks to report the changes.
+  // For that we need to merge together the updated loops and the cloned loops
+  // and check whether the original loop survived.
+  SmallVector<Loop *, 4> SibLoops;
+  for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops))
+    if (UpdatedL->getParentLoop() == ParentL)
+      SibLoops.push_back(UpdatedL);
+  NonTrivialUnswitchCB(IsStillLoop, SibLoops);
+
+  ++NumBranches;
+  return true;
+}
+
+/// Recursively compute the cost of a dominator subtree based on the per-block
+/// cost map provided.
+///
+/// The recursive computation is memoized into the provided DT-indexed cost map
+/// to allow querying it for most nodes in the domtree without it becoming
+/// quadratic.
+static int
+computeDomSubtreeCost(DomTreeNode &N,
+                      const SmallDenseMap<BasicBlock *, int, 4> &BBCostMap,
+                      SmallDenseMap<DomTreeNode *, int, 4> &DTCostMap) {
+  // Don't accumulate cost (or recurse through) blocks not in our block cost
+  // map and thus not part of the duplication cost being considered.
+  auto BBCostIt = BBCostMap.find(N.getBlock());
+  if (BBCostIt == BBCostMap.end())
+    return 0;
+
+  // Lookup this node to see if we already computed its cost.
+  auto DTCostIt = DTCostMap.find(&N);
+  if (DTCostIt != DTCostMap.end())
+    return DTCostIt->second;
+
+  // If not, we have to compute it.
+  // We can't use insert above and update because computing the cost may
+  // insert more things into the map.
+  int Cost = std::accumulate(
+      N.begin(), N.end(), BBCostIt->second, [&](int Sum, DomTreeNode *ChildN) {
+        return Sum + computeDomSubtreeCost(*ChildN, BBCostMap, DTCostMap);
+      });
+  bool Inserted = DTCostMap.insert({&N, Cost}).second;
+  (void)Inserted;
+  assert(Inserted && "Should not insert a node while visiting children!");
+  return Cost;
+}
+
 /// Unswitch control flow predicated on loop invariant conditions.
 ///
 /// This first hoists all branches or switches which are trivial (IE, do not
 /// require duplicating any part of the loop) out of the loop body. It then
 /// looks at other loop invariant control flows and tries to unswitch those as
 /// well by cloning the loop if the result is small enough.
-static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
-                         AssumptionCache &AC) {
-  assert(L.isLCSSAForm(DT) &&
+static bool
+unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
+             TargetTransformInfo &TTI, bool NonTrivial,
+             function_ref<void(bool, ArrayRef<Loop *>)> NonTrivialUnswitchCB) {
+  assert(L.isRecursivelyLCSSAForm(DT, LI) &&
          "Loops must be in LCSSA form before unswitching.");
 
   bool Changed = false;
@@ -727,7 +1926,136 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
   // Try trivial unswitch first before loop over other basic blocks in the loop.
   Changed |= unswitchAllTrivialConditions(L, DT, LI);
 
-  // FIXME: Add support for non-trivial unswitching by cloning the loop.
+  // If we're not doing non-trivial unswitching, we're done. We both accept
+  // a parameter but also check a local flag that can be used for testing
+  // and debugging.
+  if (!NonTrivial && !EnableNonTrivialUnswitch)
+    return Changed;
+
+  // Collect all remaining invariant branch conditions within this loop (as
+  // opposed to an inner loop which would be handled when visiting that inner
+  // loop).
+  SmallVector<TerminatorInst *, 4> UnswitchCandidates;
+  for (auto *BB : L.blocks())
+    if (LI.getLoopFor(BB) == &L)
+      if (auto *BI = dyn_cast<BranchInst>(BB->getTerminator()))
+        if (BI->isConditional() && L.isLoopInvariant(BI->getCondition()) &&
+            BI->getSuccessor(0) != BI->getSuccessor(1))
+          UnswitchCandidates.push_back(BI);
+
+  // If we didn't find any candidates, we're done.
+  if (UnswitchCandidates.empty())
+    return Changed;
+
+  DEBUG(dbgs() << "Considering " << UnswitchCandidates.size()
+               << " non-trivial loop invariant conditions for unswitching.\n");
+
+  // Unswitching these terminators will require duplicating parts of the loop,
+  // so we need to be able to model that cost. Compute the ephemeral values and
+  // set up a data structure to hold per-BB costs. We cache each block's cost
+  // so that we don't recompute this when considering different subsets of the
+  // loop for duplication during unswitching.
+  SmallPtrSet<const Value *, 4> EphValues;
+  CodeMetrics::collectEphemeralValues(&L, &AC, EphValues);
+  SmallDenseMap<BasicBlock *, int, 4> BBCostMap;
+
+  // Compute the cost of each block, as well as the total loop cost. Also, bail
+  // out if we see instructions which are incompatible with loop unswitching
+  // (convergent, noduplicate, or cross-basic-block tokens).
+  // FIXME: We might be able to safely handle some of these in non-duplicated
+  // regions.
+ int LoopCost = 0; + for (auto *BB : L.blocks()) { + int Cost = 0; + for (auto &I : *BB) { + if (EphValues.count(&I)) + continue; + + if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB)) + return Changed; + if (auto CS = CallSite(&I)) + if (CS.isConvergent() || CS.cannotDuplicate()) + return Changed; + + Cost += TTI.getUserCost(&I); + } + assert(Cost >= 0 && "Must not have negative costs!"); + LoopCost += Cost; + assert(LoopCost >= 0 && "Must not have negative loop costs!"); + BBCostMap[BB] = Cost; + } + DEBUG(dbgs() << " Total loop cost: " << LoopCost << "\n"); + + // Now we find the best candidate by searching for the one with the following + // properties in order: + // + // 1) An unswitching cost below the threshold + // 2) The smallest number of duplicated unswitch candidates (to avoid + // creating redundant subsequent unswitching) + // 3) The smallest cost after unswitching. + // + // We prioritize reducing fanout of unswitch candidates provided the cost + // remains below the threshold because this has a multiplicative effect. + // + // This requires memoizing each dominator subtree to avoid redundant work. + // + // FIXME: Need to actually do the number of candidates part above. + SmallDenseMap<DomTreeNode *, int, 4> DTCostMap; + // Given a terminator which might be unswitched, computes the non-duplicated + // cost for that terminator. + auto ComputeUnswitchedCost = [&](TerminatorInst *TI) { + BasicBlock &BB = *TI->getParent(); + SmallPtrSet<BasicBlock *, 4> Visited; + + int Cost = LoopCost; + for (BasicBlock *SuccBB : successors(&BB)) { + // Don't count successors more than once. + if (!Visited.insert(SuccBB).second) + continue; + + // This successor's domtree will not need to be duplicated after + // unswitching if the edge to the successor dominates it (and thus the + // entire tree). This essentially means there is no other path into this + // subtree and so it will end up live in only one clone of the loop. + if (SuccBB->getUniquePredecessor() || + llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) { + return PredBB == &BB || DT.dominates(SuccBB, PredBB); + })) { + Cost -= computeDomSubtreeCost(*DT[SuccBB], BBCostMap, DTCostMap); + assert(Cost >= 0 && + "Non-duplicated cost should never exceed total loop cost!"); + } + } + + // Now scale the cost by the number of unique successors minus one. We + // subtract one because there is already at least one copy of the entire + // loop. This is computing the new cost of unswitching a condition. 
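As a worked example of this estimate: with a total LoopCost of 100 and a candidate branch with two distinct successors, where only one successor dominates its whole subtree (say, of cost 30), the non-duplicated cost is 100 - 30 = 70, and scaling by the number of unique successors minus one gives 70 * (2 - 1) = 70 as the cost of the one extra loop copy created by unswitching.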
+ assert(Visited.size() > 1 && + "Cannot unswitch a condition without multiple distinct successors!"); + return Cost * (Visited.size() - 1); + }; + TerminatorInst *BestUnswitchTI = nullptr; + int BestUnswitchCost; + for (TerminatorInst *CandidateTI : UnswitchCandidates) { + int CandidateCost = ComputeUnswitchedCost(CandidateTI); + DEBUG(dbgs() << " Computed cost of " << CandidateCost + << " for unswitch candidate: " << *CandidateTI << "\n"); + if (!BestUnswitchTI || CandidateCost < BestUnswitchCost) { + BestUnswitchTI = CandidateTI; + BestUnswitchCost = CandidateCost; + } + } + + if (BestUnswitchCost < UnswitchThreshold) { + DEBUG(dbgs() << " Trying to unswitch non-trivial (cost = " + << BestUnswitchCost << ") branch: " << *BestUnswitchTI + << "\n"); + Changed |= unswitchInvariantBranch(L, cast<BranchInst>(*BestUnswitchTI), DT, + LI, AC, NonTrivialUnswitchCB); + } else { + DEBUG(dbgs() << "Cannot unswitch, lowest cost found: " << BestUnswitchCost + << "\n"); + } return Changed; } @@ -740,7 +2068,25 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L << "\n"); - if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC)) + // Save the current loop name in a variable so that we can report it even + // after it has been deleted. + std::string LoopName = L.getName(); + + auto NonTrivialUnswitchCB = [&L, &U, &LoopName](bool CurrentLoopValid, + ArrayRef<Loop *> NewLoops) { + // If we did a non-trivial unswitch, we have added new (cloned) loops. + U.addSiblingLoops(NewLoops); + + // If the current loop remains valid, we should revisit it to catch any + // other unswitch opportunities. Otherwise, we need to mark it as deleted. + if (CurrentLoopValid) + U.revisitCurrentLoop(); + else + U.markLoopAsDeleted(L, LoopName); + }; + + if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.TTI, NonTrivial, + NonTrivialUnswitchCB)) return PreservedAnalyses::all(); #ifndef NDEBUG @@ -754,10 +2100,13 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, namespace { class SimpleLoopUnswitchLegacyPass : public LoopPass { + bool NonTrivial; + public: static char ID; // Pass ID, replacement for typeid - explicit SimpleLoopUnswitchLegacyPass() : LoopPass(ID) { + explicit SimpleLoopUnswitchLegacyPass(bool NonTrivial = false) + : LoopPass(ID), NonTrivial(NonTrivial) { initializeSimpleLoopUnswitchLegacyPassPass( *PassRegistry::getPassRegistry()); } @@ -766,6 +2115,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); getLoopAnalysisUsage(AU); } }; @@ -783,8 +2133,29 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) { auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + + auto NonTrivialUnswitchCB = [&L, &LPM](bool CurrentLoopValid, + ArrayRef<Loop *> NewLoops) { + // If we did a non-trivial unswitch, we have added new (cloned) loops. + for (auto *NewL : NewLoops) + LPM.addLoop(*NewL); + + // If the current loop remains valid, re-add it to the queue. This is + // a little wasteful as we'll finish processing the current loop as well, + // but it is the best we can do in the old PM. 
+ if (CurrentLoopValid) + LPM.addLoop(*L); + else + LPM.markLoopAsDeleted(*L); + }; + + bool Changed = + unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, NonTrivialUnswitchCB); - bool Changed = unswitchLoop(*L, DT, LI, AC); + // If anything was unswitched, also clear any cached information about this + // loop. + LPM.deleteSimpleAnalysisLoop(L); #ifndef NDEBUG // Historically this pass has had issues with the dominator tree so verify it @@ -798,11 +2169,13 @@ char SimpleLoopUnswitchLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch", "Simple unswitch loops", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch", "Simple unswitch loops", false, false) -Pass *llvm::createSimpleLoopUnswitchLegacyPass() { - return new SimpleLoopUnswitchLegacyPass(); +Pass *llvm::createSimpleLoopUnswitchLegacyPass(bool NonTrivial) { + return new SimpleLoopUnswitchLegacyPass(NonTrivial); } diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 8754c714c5b2..1522170dc3b9 100644 --- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -45,9 +45,26 @@ using namespace llvm; #define DEBUG_TYPE "simplifycfg" -static cl::opt<unsigned> -UserBonusInstThreshold("bonus-inst-threshold", cl::Hidden, cl::init(1), - cl::desc("Control the number of bonus instructions (default = 1)")); +static cl::opt<unsigned> UserBonusInstThreshold( + "bonus-inst-threshold", cl::Hidden, cl::init(1), + cl::desc("Control the number of bonus instructions (default = 1)")); + +static cl::opt<bool> UserKeepLoops( + "keep-loops", cl::Hidden, cl::init(true), + cl::desc("Preserve canonical loop structure (default = true)")); + +static cl::opt<bool> UserSwitchToLookup( + "switch-to-lookup", cl::Hidden, cl::init(false), + cl::desc("Convert switches to lookup tables (default = false)")); + +static cl::opt<bool> UserForwardSwitchCond( + "forward-switch-cond", cl::Hidden, cl::init(false), + cl::desc("Forward switch condition to phi ops (default = false)")); + +static cl::opt<bool> UserSinkCommonInsts( + "sink-common-insts", cl::Hidden, cl::init(false), + cl::desc("Sink common instructions (default = false)")); + STATISTIC(NumSimpl, "Number of blocks simplified"); @@ -129,9 +146,7 @@ static bool mergeEmptyReturnBlocks(Function &F) { /// Call SimplifyCFG on all the blocks in the function, /// iterating until no more changes are made. static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, - AssumptionCache *AC, - unsigned BonusInstThreshold, - bool LateSimplifyCFG) { + const SimplifyCFGOptions &Options) { bool Changed = false; bool LocalChange = true; @@ -146,7 +161,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, // Loop over all of the basic blocks and remove them if they are unneeded. 
for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { - if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC, &LoopHeaders, LateSimplifyCFG)) { + if (simplifyCFG(&*BBIt++, TTI, Options, &LoopHeaders)) { LocalChange = true; ++NumSimpl; } @@ -157,12 +172,10 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, } static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI, - AssumptionCache *AC, int BonusInstThreshold, - bool LateSimplifyCFG) { + const SimplifyCFGOptions &Options) { bool EverChanged = removeUnreachableBlocks(F); EverChanged |= mergeEmptyReturnBlocks(F); - EverChanged |= iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold, - LateSimplifyCFG); + EverChanged |= iterativelySimplifyCFG(F, TTI, Options); // If neither pass changed anything, we're done. if (!EverChanged) return false; @@ -176,28 +189,37 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI, return true; do { - EverChanged = iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold, - LateSimplifyCFG); + EverChanged = iterativelySimplifyCFG(F, TTI, Options); EverChanged |= removeUnreachableBlocks(F); } while (EverChanged); return true; } -SimplifyCFGPass::SimplifyCFGPass() - : BonusInstThreshold(UserBonusInstThreshold), - LateSimplifyCFG(true) {} - -SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold, bool LateSimplifyCFG) - : BonusInstThreshold(BonusInstThreshold), - LateSimplifyCFG(LateSimplifyCFG) {} +// Command-line settings override compile-time settings. +SimplifyCFGPass::SimplifyCFGPass(const SimplifyCFGOptions &Opts) { + Options.BonusInstThreshold = UserBonusInstThreshold.getNumOccurrences() + ? UserBonusInstThreshold + : Opts.BonusInstThreshold; + Options.ForwardSwitchCondToPhi = UserForwardSwitchCond.getNumOccurrences() + ? UserForwardSwitchCond + : Opts.ForwardSwitchCondToPhi; + Options.ConvertSwitchToLookupTable = UserSwitchToLookup.getNumOccurrences() + ? UserSwitchToLookup + : Opts.ConvertSwitchToLookupTable; + Options.NeedCanonicalLoop = UserKeepLoops.getNumOccurrences() + ? UserKeepLoops + : Opts.NeedCanonicalLoop; + Options.SinkCommonInsts = UserSinkCommonInsts.getNumOccurrences() + ? UserSinkCommonInsts + : Opts.SinkCommonInsts; +} PreservedAnalyses SimplifyCFGPass::run(Function &F, FunctionAnalysisManager &AM) { auto &TTI = AM.getResult<TargetIRAnalysis>(F); - auto &AC = AM.getResult<AssumptionAnalysis>(F); - - if (!simplifyFunctionCFG(F, TTI, &AC, BonusInstThreshold, LateSimplifyCFG)) + Options.AC = &AM.getResult<AssumptionAnalysis>(F); + if (!simplifyFunctionCFG(F, TTI, Options)) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserve<GlobalsAA>(); @@ -205,55 +227,54 @@ PreservedAnalyses SimplifyCFGPass::run(Function &F, } namespace { -struct BaseCFGSimplifyPass : public FunctionPass { - unsigned BonusInstThreshold; +struct CFGSimplifyPass : public FunctionPass { + static char ID; + SimplifyCFGOptions Options; std::function<bool(const Function &)> PredicateFtor; - bool LateSimplifyCFG; - BaseCFGSimplifyPass(int T, bool LateSimplifyCFG, - std::function<bool(const Function &)> Ftor, - char &ID) - : FunctionPass(ID), PredicateFtor(std::move(Ftor)), - LateSimplifyCFG(LateSimplifyCFG) { - BonusInstThreshold = (T == -1) ? 
UserBonusInstThreshold : unsigned(T); + CFGSimplifyPass(unsigned Threshold = 1, bool ForwardSwitchCond = false, + bool ConvertSwitch = false, bool KeepLoops = true, + bool SinkCommon = false, + std::function<bool(const Function &)> Ftor = nullptr) + : FunctionPass(ID), PredicateFtor(std::move(Ftor)) { + + initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); + + // Check for command-line overrides of options for debug/customization. + Options.BonusInstThreshold = UserBonusInstThreshold.getNumOccurrences() + ? UserBonusInstThreshold + : Threshold; + + Options.ForwardSwitchCondToPhi = UserForwardSwitchCond.getNumOccurrences() + ? UserForwardSwitchCond + : ForwardSwitchCond; + + Options.ConvertSwitchToLookupTable = UserSwitchToLookup.getNumOccurrences() + ? UserSwitchToLookup + : ConvertSwitch; + + Options.NeedCanonicalLoop = + UserKeepLoops.getNumOccurrences() ? UserKeepLoops : KeepLoops; + + Options.SinkCommonInsts = UserSinkCommonInsts.getNumOccurrences() + ? UserSinkCommonInsts + : SinkCommon; } + bool runOnFunction(Function &F) override { if (skipFunction(F) || (PredicateFtor && !PredicateFtor(F))) return false; - AssumptionCache *AC = - &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - const TargetTransformInfo &TTI = - getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - return simplifyFunctionCFG(F, TTI, AC, BonusInstThreshold, LateSimplifyCFG); + Options.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + return simplifyFunctionCFG(F, TTI, Options); } - void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); } }; - -struct CFGSimplifyPass : public BaseCFGSimplifyPass { - static char ID; // Pass identification, replacement for typeid - - CFGSimplifyPass(int T = -1, - std::function<bool(const Function &)> Ftor = nullptr) - : BaseCFGSimplifyPass(T, false, Ftor, ID) { - initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); - } -}; - -struct LateCFGSimplifyPass : public BaseCFGSimplifyPass { - static char ID; // Pass identification, replacement for typeid - - LateCFGSimplifyPass(int T = -1, - std::function<bool(const Function &)> Ftor = nullptr) - : BaseCFGSimplifyPass(T, true, Ftor, ID) { - initializeLateCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); - } -}; } char CFGSimplifyPass::ID = 0; @@ -264,24 +285,12 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false, false) -char LateCFGSimplifyPass::ID = 0; -INITIALIZE_PASS_BEGIN(LateCFGSimplifyPass, "latesimplifycfg", - "Simplify the CFG more aggressively", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_END(LateCFGSimplifyPass, "latesimplifycfg", - "Simplify the CFG more aggressively", false, false) - // Public interface to the CFGSimplification pass FunctionPass * -llvm::createCFGSimplificationPass(int Threshold, - std::function<bool(const Function &)> Ftor) { - return new CFGSimplifyPass(Threshold, std::move(Ftor)); -} - -// Public interface to the LateCFGSimplification pass -FunctionPass * -llvm::createLateCFGSimplificationPass(int Threshold, +llvm::createCFGSimplificationPass(unsigned Threshold, bool ForwardSwitchCond, + bool ConvertSwitch, bool KeepLoops, + bool SinkCommon, 
std::function<bool(const Function &)> Ftor) {
-  return new LateCFGSimplifyPass(Threshold, std::move(Ftor));
+  return new CFGSimplifyPass(Threshold, ForwardSwitchCond, ConvertSwitch,
+                             KeepLoops, SinkCommon, std::move(Ftor));
 }
diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp
index 5210f165b874..cfb8a062299f 100644
--- a/lib/Transforms/Scalar/Sink.cpp
+++ b/lib/Transforms/Scalar/Sink.cpp
@@ -68,7 +68,7 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
   if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
     MemoryLocation Loc = MemoryLocation::get(L);
     for (Instruction *S : Stores)
-      if (AA.getModRefInfo(S, Loc) & MRI_Mod)
+      if (isModSet(AA.getModRefInfo(S, Loc)))
         return false;
   }
@@ -83,7 +83,7 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
       return false;
     for (Instruction *S : Stores)
-      if (AA.getModRefInfo(S, CS) & MRI_Mod)
+      if (isModSet(AA.getModRefInfo(S, CS)))
         return false;
   }
diff --git a/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp b/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
new file mode 100644
index 000000000000..23156d5a4d83
--- /dev/null
+++ b/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
@@ -0,0 +1,811 @@
+//===- SpeculateAroundPHIs.cpp --------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SpeculateAroundPHIs.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "spec-phis"
+
+STATISTIC(NumPHIsSpeculated, "Number of PHI nodes we speculated around");
+STATISTIC(NumEdgesSplit,
+          "Number of critical edges which were split for speculation");
+STATISTIC(NumSpeculatedInstructions,
+          "Number of instructions we speculated around the PHI nodes");
+STATISTIC(NumNewRedundantInstructions,
+          "Number of new, redundant instructions inserted");
+
+/// Check whether speculating the users of a PHI node around the PHI
+/// will be safe.
+///
+/// This checks both that all of the users are safe and also that all of their
+/// operands are either recursively safe or already available along an incoming
+/// edge to the PHI.
+///
+/// This routine caches both all the safe nodes explored in `PotentialSpecSet`
+/// and the chain of nodes that definitively reach any unsafe node in
+/// `UnsafeSet`. By preserving these between repeated calls to this routine for
+/// PHIs in the same basic block, the exploration here can be reused. However,
+/// these caches must not be reused for PHIs in a different basic block as they
+/// reflect what is available along incoming edges.
+static bool
+isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
+                          SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
+                          SmallPtrSetImpl<Instruction *> &UnsafeSet) {
+  auto *PhiBB = PN.getParent();
+  SmallPtrSet<Instruction *, 4> Visited;
+  SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;
+
+  // Walk each user of the PHI node.
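The walk below is an explicit-stack depth-first search: each stack entry pairs an instruction with the operand iterator at which to resume once that operand's subtree is finished. A minimal sketch of just that traversal pattern, with a hypothetical isUnsafe() predicate standing in for the real memory-dependence and availability checks:

    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Instruction.h"
    #include <tuple>
    #include <utility>
    using namespace llvm;

    bool isUnsafe(Instruction *I); // hypothetical stand-in predicate

    bool allTransitiveOperandsSafe(Instruction *Root) {
      SmallPtrSet<Instruction *, 16> Visited;
      SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> Stack;
      Visited.insert(Root);
      Stack.push_back({Root, Root->value_op_begin()});
      do {
        Instruction *I;
        User::value_op_iterator OpIt;
        std::tie(I, OpIt) = Stack.pop_back_val();
        while (OpIt != I->value_op_end()) {
          auto *OpI = dyn_cast<Instruction>(*OpIt++);
          // Skip non-instruction operands and nodes already seen.
          if (!OpI || !Visited.insert(OpI).second)
            continue;
          // Check in preorder, before descending into OpI's operands.
          if (isUnsafe(OpI))
            return false;
          Stack.push_back({I, OpIt}); // remember where to resume in I
          I = OpI;
          OpIt = OpI->value_op_begin();
        }
      } while (!Stack.empty());
      return true;
    }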
+ for (Use &U : PN.uses()) { + auto *UI = cast<Instruction>(U.getUser()); + + // Ensure the use post-dominates the PHI node. This ensures that, in the + // absence of unwinding, the use will actually be reached. + // FIXME: We use a blunt hammer of requiring them to be in the same basic + // block. We should consider using actual post-dominance here in the + // future. + if (UI->getParent() != PhiBB) { + DEBUG(dbgs() << " Unsafe: use in a different BB: " << *UI << "\n"); + return false; + } + + // FIXME: This check is much too conservative. We're not going to move these + // instructions onto new dynamic paths through the program unless there is + // a call instruction between the use and the PHI node. And memory isn't + // changing unless there is a store in that same sequence. We should + // probably change this to do at least a limited scan of the intervening + // instructions and allow handling stores in easily proven safe cases. + if (mayBeMemoryDependent(*UI)) { + DEBUG(dbgs() << " Unsafe: can't speculate use: " << *UI << "\n"); + return false; + } + + // Now do a depth-first search of everything these users depend on to make + // sure they are transitively safe. This is a depth-first search, but we + // check nodes in preorder to minimize the amount of checking. + Visited.insert(UI); + DFSStack.push_back({UI, UI->value_op_begin()}); + do { + User::value_op_iterator OpIt; + std::tie(UI, OpIt) = DFSStack.pop_back_val(); + + while (OpIt != UI->value_op_end()) { + auto *OpI = dyn_cast<Instruction>(*OpIt); + // Increment to the next operand for whenever we continue. + ++OpIt; + // No need to visit non-instructions, which can't form dependencies. + if (!OpI) + continue; + + // Now do the main pre-order checks that this operand is a viable + // dependency of something we want to speculate. + + // First do a few checks for instructions that won't require + // speculation at all because they are trivially available on the + // incoming edge (either through dominance or through an incoming value + // to a PHI). + // + // The cases in the current block will be trivially dominated by the + // edge. + auto *ParentBB = OpI->getParent(); + if (ParentBB == PhiBB) { + if (isa<PHINode>(OpI)) { + // We can trivially map through phi nodes in the same block. + continue; + } + } else if (DT.dominates(ParentBB, PhiBB)) { + // Instructions from dominating blocks are already available. + continue; + } + + // Once we know that we're considering speculating the operand, check + // if we've already explored this subgraph and found it to be safe. + if (PotentialSpecSet.count(OpI)) + continue; + + // If we've already explored this subgraph and found it unsafe, bail. + // If when we directly test whether this is safe it fails, bail. + if (UnsafeSet.count(OpI) || ParentBB != PhiBB || + mayBeMemoryDependent(*OpI)) { + DEBUG(dbgs() << " Unsafe: can't speculate transitive use: " << *OpI + << "\n"); + // Record the stack of instructions which reach this node as unsafe + // so we prune subsequent searches. + UnsafeSet.insert(OpI); + for (auto &StackPair : DFSStack) { + Instruction *I = StackPair.first; + UnsafeSet.insert(I); + } + return false; + } + + // Skip any operands we're already recursively checking. + if (!Visited.insert(OpI).second) + continue; + + // Push onto the stack and descend. We can directly continue this + // loop when ascending. + DFSStack.push_back({UI, OpIt}); + UI = OpI; + OpIt = OpI->value_op_begin(); + } + + // This node and all its operands are safe. 
Go ahead and cache that for
+      // reuse later.
+      PotentialSpecSet.insert(UI);
+
+      // Continue with the next node on the stack.
+    } while (!DFSStack.empty());
+  }
+
+#ifndef NDEBUG
+  // Every visited operand should have been marked as safe for speculation at
+  // this point. Verify this and return success.
+  for (auto *I : Visited)
+    assert(PotentialSpecSet.count(I) &&
+           "Failed to mark a visited instruction as safe!");
+#endif
+  return true;
+}
+
+/// Check whether, in isolation, a given PHI node is both safe and profitable
+/// to speculate users around.
+///
+/// This handles checking whether there are any constant operands to a PHI
+/// which could represent a useful speculation candidate, whether the users of
+/// the PHI are safe to speculate including all their transitive dependencies,
+/// and whether after speculation there will be some cost savings (profit) to
+/// folding the operands into the users of the PHI node. Returns true if both
+/// safe and profitable with relevant cost savings updated in the map and with
+/// an update to the `PotentialSpecSet`. Returns false if either safety or
+/// profitability are absent. Some new entries may be made to the
+/// `PotentialSpecSet` even when this routine returns false, but they remain
+/// conservatively correct.
+///
+/// The profitability check here is a local one, but it checks this in an
+/// interesting way. Beyond checking that the total cost of materializing the
+/// constants will be less than the cost of folding them into their users, it
+/// also checks that no one incoming constant will have a higher cost when
+/// folded into its users rather than materialized. This higher cost could
+/// result in a dynamic *path* that is more expensive even when the total cost
+/// is lower. Currently, all of the interesting cases where this optimization
+/// should fire are ones where it is a no-loss operation in this sense. If we
+/// ever want to be more aggressive here, we would need to balance the
+/// different incoming edges' cost by looking at their respective
+/// probabilities.
+static bool isSafeAndProfitableToSpeculateAroundPHI(
+    PHINode &PN, SmallDenseMap<PHINode *, int, 16> &CostSavingsMap,
+    SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
+    SmallPtrSetImpl<Instruction *> &UnsafeSet, DominatorTree &DT,
+    TargetTransformInfo &TTI) {
+  // First see whether there is any cost savings to speculating around this
+  // PHI, and build up a map of the constant inputs to how many times they
+  // occur.
+  bool NonFreeMat = false;
+  struct CostsAndCount {
+    int MatCost = TargetTransformInfo::TCC_Free;
+    int FoldedCost = TargetTransformInfo::TCC_Free;
+    int Count = 0;
+  };
+  SmallDenseMap<ConstantInt *, CostsAndCount, 16> CostsAndCounts;
+  SmallPtrSet<BasicBlock *, 16> IncomingConstantBlocks;
+  for (int i : llvm::seq<int>(0, PN.getNumIncomingValues())) {
+    auto *IncomingC = dyn_cast<ConstantInt>(PN.getIncomingValue(i));
+    if (!IncomingC)
+      continue;
+
+    // Only visit each incoming edge with a constant input once.
+    if (!IncomingConstantBlocks.insert(PN.getIncomingBlock(i)).second)
+      continue;
+
+    auto InsertResult = CostsAndCounts.insert({IncomingC, {}});
+    // Count how many edges share a given incoming constant.
+    ++InsertResult.first->second.Count;
+    // Only compute the cost the first time we see a particular constant.
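That guard relies on the count-always, compute-once idiom: insert() returns an iterator/bool pair whose bool is true only when the key was newly added. Sketched in isolation, with hypothetical key and payload types:

    #include "llvm/ADT/DenseMap.h"

    struct Info {
      int Count = 0;
      int Cost = 0;
    };

    int computeCostOnce(int Key); // hypothetical, assumed expensive

    void record(llvm::DenseMap<int, Info> &Counts, int Key) {
      auto InsertResult = Counts.insert({Key, Info{}}); // {iterator, inserted}
      ++InsertResult.first->second.Count; // counted on every call
      if (InsertResult.second)            // first time we see this key
        InsertResult.first->second.Cost = computeCostOnce(Key);
    }

The loop here takes the inverted early-continue form of the same test.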
+    if (!InsertResult.second)
+      continue;
+
+    int &MatCost = InsertResult.first->second.MatCost;
+    MatCost = TTI.getIntImmCost(IncomingC->getValue(), IncomingC->getType());
+    NonFreeMat |= MatCost != TTI.TCC_Free;
+  }
+  if (!NonFreeMat) {
+    DEBUG(dbgs() << "    Free: " << PN << "\n");
+    // No profit in free materialization.
+    return false;
+  }
+
+  // Now check that the uses of this PHI can actually be speculated,
+  // otherwise we'll still have to materialize the PHI value.
+  if (!isSafeToSpeculatePHIUsers(PN, DT, PotentialSpecSet, UnsafeSet)) {
+    DEBUG(dbgs() << "    Unsafe PHI: " << PN << "\n");
+    return false;
+  }
+
+  // Compute how much (if any) savings are available by speculating around this
+  // PHI.
+  for (Use &U : PN.uses()) {
+    auto *UserI = cast<Instruction>(U.getUser());
+    // Now check whether there is any savings to folding the incoming constants
+    // into this use.
+    unsigned Idx = U.getOperandNo();
+
+    // If we have a binary operator that is commutative, an actual constant
+    // operand would end up on the RHS, so pretend the use of the PHI is on the
+    // RHS.
+    //
+    // Technically, this is a bit weird if *both* operands are PHIs we're
+    // speculating. But if that is the case, giving an "optimistic" cost isn't
+    // a bad thing because after speculation it will constant fold. And
+    // moreover, such cases should likely have been constant folded already by
+    // some other pass, so we shouldn't worry about "modeling" them terribly
+    // accurately here. Similarly, if the other operand is a constant, it still
+    // seems fine to be "optimistic" in our cost modeling, because when the
+    // incoming operand from the PHI node is also a constant, we will end up
+    // constant folding.
+    if (UserI->isBinaryOp() && UserI->isCommutative() && Idx != 1)
+      // Assume we will commute the constant to the RHS to be canonical.
+      Idx = 1;
+
+    // Get the intrinsic ID if this user is an intrinsic.
+    Intrinsic::ID IID = Intrinsic::not_intrinsic;
+    if (auto *UserII = dyn_cast<IntrinsicInst>(UserI))
+      IID = UserII->getIntrinsicID();
+
+    for (auto &IncomingConstantAndCostsAndCount : CostsAndCounts) {
+      ConstantInt *IncomingC = IncomingConstantAndCostsAndCount.first;
+      int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
+      int &FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
+      if (IID)
+        FoldedCost += TTI.getIntImmCost(IID, Idx, IncomingC->getValue(),
+                                        IncomingC->getType());
+      else
+        FoldedCost +=
+            TTI.getIntImmCost(UserI->getOpcode(), Idx, IncomingC->getValue(),
+                              IncomingC->getType());
+
+      // If we accumulate more folded cost for this incoming constant than
+      // materialized cost, then we'll regress any edge with this constant so
+      // just bail. We're only interested in cases where folding the incoming
+      // constants is at least break-even on all paths.
+      if (FoldedCost > MatCost) {
+        DEBUG(dbgs() << "  Not profitable to fold imm: " << *IncomingC << "\n"
+                        "    Materializing cost:    " << MatCost << "\n"
+                        "    Accumulated folded cost: " << FoldedCost << "\n");
+        return false;
+      }
+    }
+  }
+
+  // Compute the total cost savings afforded by this PHI node.
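As a purely illustrative instance of the totals computed just below (the real values are target-dependent): if a PHI has two incoming edges carrying one constant, materializing that constant costs TCC_Basic (1) per edge, and folding it into the PHI's single user is TCC_Free (0), then TotalMatCost = 2 * 1 = 2, TotalFoldedCost = 2 * 0 = 0, and a savings of 2 is recorded for the PHI.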
+  int TotalMatCost = TTI.TCC_Free, TotalFoldedCost = TTI.TCC_Free;
+  for (auto IncomingConstantAndCostsAndCount : CostsAndCounts) {
+    int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
+    int FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
+    int Count = IncomingConstantAndCostsAndCount.second.Count;
+
+    TotalMatCost += MatCost * Count;
+    TotalFoldedCost += FoldedCost * Count;
+  }
+  assert(TotalFoldedCost <= TotalMatCost && "If each constant's folded cost is "
+                                            "less than its materialized cost, "
+                                            "the sum must be as well.");
+
+  DEBUG(dbgs() << "  Cost savings " << (TotalMatCost - TotalFoldedCost)
+               << ": " << PN << "\n");
+  CostSavingsMap[&PN] = TotalMatCost - TotalFoldedCost;
+  return true;
+}
+
+/// Simple helper to walk all the users of a list of phis depth first, and call
+/// a visit function on each one in post-order.
+///
+/// All of the PHIs should be in the same basic block, and this is primarily
+/// used to make a single depth-first walk across their collective users
+/// without revisiting any subgraphs. Callers should provide a fast, idempotent
+/// callable to test whether a node has been visited and the more important
+/// callable to actually visit a particular node.
+///
+/// Depth-first and postorder here refer to the *operand* graph -- we start
+/// from a collection of users of PHI nodes and walk "up" the operands
+/// depth-first.
+template <typename IsVisitedT, typename VisitT>
+static void visitPHIUsersAndDepsInPostOrder(ArrayRef<PHINode *> PNs,
+                                            IsVisitedT IsVisited,
+                                            VisitT Visit) {
+  SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;
+  for (auto *PN : PNs)
+    for (Use &U : PN->uses()) {
+      auto *UI = cast<Instruction>(U.getUser());
+      if (IsVisited(UI))
+        // Already visited this user, continue across the roots.
+        continue;
+
+      // Otherwise, walk the operand graph depth-first and visit each
+      // dependency in postorder.
+      DFSStack.push_back({UI, UI->value_op_begin()});
+      do {
+        User::value_op_iterator OpIt;
+        std::tie(UI, OpIt) = DFSStack.pop_back_val();
+        while (OpIt != UI->value_op_end()) {
+          auto *OpI = dyn_cast<Instruction>(*OpIt);
+          // Increment to the next operand for whenever we continue.
+          ++OpIt;
+          // No need to visit non-instructions, which can't form dependencies,
+          // or instructions outside of our potential dependency set that we
+          // were given. Finally, if we've already visited the node, continue
+          // to the next.
+          if (!OpI || IsVisited(OpI))
+            continue;
+
+          // Push onto the stack and descend. We can directly continue this
+          // loop when ascending.
+          DFSStack.push_back({UI, OpIt});
+          UI = OpI;
+          OpIt = OpI->value_op_begin();
+        }
+
+        // Finished visiting children, visit this node.
+        assert(!IsVisited(UI) && "Should not have already visited a node!");
+        Visit(UI);
+      } while (!DFSStack.empty());
+    }
+}
+
+/// Find profitable PHIs to speculate.
+///
+/// For a PHI node to be profitable, we need the cost of speculating its users
+/// (and their dependencies) to not exceed the savings of folding the PHI's
+/// constant operands into the speculated users.
+///
+/// Computing this is surprisingly challenging. Because users of two different
+/// PHI nodes can depend on each other or on common other instructions, it may
+/// be profitable to speculate two PHI nodes together even though neither one
+/// in isolation is profitable. The straightforward way to find all the
+/// profitable PHIs would be to check each combination of PHIs' cost, but this
+/// is exponential in complexity.
+/// +/// Even if we assume that we only care about cases where we can consider each +/// PHI node in isolation (rather than considering cases where none are +/// profitable in isolation but some subset are profitable as a set), we still +/// have a challenge. The obvious way to find all individually profitable PHIs +/// is to iterate until reaching a fixed point, but this will be quadratic in +/// complexity. =/ +/// +/// This code currently uses a linear-to-compute order for a greedy approach. +/// It won't find cases where a set of PHIs must be considered together, but it +/// handles most cases of order dependence without quadratic iteration. The +/// specific order used is the post-order across the operand DAG. When the last +/// user of a PHI is visited in this postorder walk, we check it for +/// profitability. +/// +/// There is an orthogonal extra complexity to all of this: computing the cost +/// itself can easily become a linear computation making everything again (at +/// best) quadratic. Using a postorder over the operand graph makes it +/// particularly easy to avoid this through dynamic programming. As we do the +/// postorder walk, we build the transitive cost of that subgraph. It is also +/// straightforward to then update these costs when we mark a PHI for +/// speculation so that subsequent PHIs don't re-pay the cost of already +/// speculated instructions. +static SmallVector<PHINode *, 16> +findProfitablePHIs(ArrayRef<PHINode *> PNs, + const SmallDenseMap<PHINode *, int, 16> &CostSavingsMap, + const SmallPtrSetImpl<Instruction *> &PotentialSpecSet, + int NumPreds, DominatorTree &DT, TargetTransformInfo &TTI) { + SmallVector<PHINode *, 16> SpecPNs; + + // First, establish a reverse mapping from immediate users of the PHI nodes + // to the nodes themselves, and count how many users each PHI node has in + // a way we can update while processing them. + SmallDenseMap<Instruction *, TinyPtrVector<PHINode *>, 16> UserToPNMap; + SmallDenseMap<PHINode *, int, 16> PNUserCountMap; + SmallPtrSet<Instruction *, 16> UserSet; + for (auto *PN : PNs) { + assert(UserSet.empty() && "Must start with an empty user set!"); + for (Use &U : PN->uses()) + UserSet.insert(cast<Instruction>(U.getUser())); + PNUserCountMap[PN] = UserSet.size(); + for (auto *UI : UserSet) + UserToPNMap.insert({UI, {}}).first->second.push_back(PN); + UserSet.clear(); + } + + // Now do a DFS across the operand graph of the users, computing cost as we + // go and when all costs for a given PHI are known, checking that PHI for + // profitability. + SmallDenseMap<Instruction *, int, 16> SpecCostMap; + visitPHIUsersAndDepsInPostOrder( + PNs, + /*IsVisited*/ + [&](Instruction *I) { + // We consider anything that isn't potentially speculated to be + // "visited" as it is already handled. Similarly, anything that *is* + // potentially speculated but for which we have an entry in our cost + // map, we're done. + return !PotentialSpecSet.count(I) || SpecCostMap.count(I); + }, + /*Visit*/ + [&](Instruction *I) { + // We've fully visited the operands, so sum their cost with this node + // and update the cost map. 
+        int Cost = TTI.TCC_Free;
+        for (Value *OpV : I->operand_values())
+          if (auto *OpI = dyn_cast<Instruction>(OpV)) {
+            auto CostMapIt = SpecCostMap.find(OpI);
+            if (CostMapIt != SpecCostMap.end())
+              Cost += CostMapIt->second;
+          }
+        Cost += TTI.getUserCost(I);
+        bool Inserted = SpecCostMap.insert({I, Cost}).second;
+        (void)Inserted;
+        assert(Inserted && "Must not re-insert a cost during the DFS!");
+
+        // Now check if this node had a corresponding PHI node using it. If so,
+        // we need to decrement the outstanding user count for it.
+        auto UserPNsIt = UserToPNMap.find(I);
+        if (UserPNsIt == UserToPNMap.end())
+          return;
+        auto &UserPNs = UserPNsIt->second;
+        auto UserPNsSplitIt = std::stable_partition(
+            UserPNs.begin(), UserPNs.end(), [&](PHINode *UserPN) {
+              int &PNUserCount = PNUserCountMap.find(UserPN)->second;
+              assert(
+                  PNUserCount > 0 &&
+                  "Should never re-visit a PN after its user count hits zero!");
+              --PNUserCount;
+              return PNUserCount != 0;
+            });
+
+        // FIXME: Rather than one at a time, we should sum the savings as the
+        // cost will be completely shared.
+        SmallVector<Instruction *, 16> SpecWorklist;
+        for (auto *PN : llvm::make_range(UserPNsSplitIt, UserPNs.end())) {
+          int SpecCost = TTI.TCC_Free;
+          for (Use &U : PN->uses())
+            SpecCost +=
+                SpecCostMap.find(cast<Instruction>(U.getUser()))->second;
+          SpecCost *= (NumPreds - 1);
+          // When the user count of a PHI node hits zero, we should check its
+          // profitability. If profitable, we should mark it for speculation
+          // and zero out the cost of everything it depends on.
+          int CostSavings = CostSavingsMap.find(PN)->second;
+          if (SpecCost > CostSavings) {
+            DEBUG(dbgs() << "  Not profitable, speculation cost: " << *PN << "\n"
+                            "    Cost savings: " << CostSavings << "\n"
+                            "    Speculation cost: " << SpecCost << "\n");
+            continue;
+          }
+
+          // We're going to speculate this user-associated PHI. Copy it out and
+          // add its users to the worklist to update their cost.
+          SpecPNs.push_back(PN);
+          for (Use &U : PN->uses()) {
+            auto *UI = cast<Instruction>(U.getUser());
+            auto CostMapIt = SpecCostMap.find(UI);
+            if (CostMapIt->second == 0)
+              continue;
+            // Zero out this cost entry to avoid duplicates.
+            CostMapIt->second = 0;
+            SpecWorklist.push_back(UI);
+          }
+        }
+
+        // Now walk all the operands of the users in the worklist transitively
+        // to zero out all the memoized costs.
+        while (!SpecWorklist.empty()) {
+          Instruction *SpecI = SpecWorklist.pop_back_val();
+          assert(SpecCostMap.find(SpecI)->second == 0 &&
+                 "Didn't zero out a cost!");
+
+          // Walk the operands recursively to zero out their cost as well.
+          for (auto *OpV : SpecI->operand_values()) {
+            auto *OpI = dyn_cast<Instruction>(OpV);
+            if (!OpI)
+              continue;
+            auto CostMapIt = SpecCostMap.find(OpI);
+            if (CostMapIt == SpecCostMap.end() || CostMapIt->second == 0)
+              continue;
+            CostMapIt->second = 0;
+            SpecWorklist.push_back(OpI);
+          }
+        }
+      });
+
+  return SpecPNs;
+}
+
+/// Speculate users around a set of PHI nodes.
+///
+/// This routine does the actual speculation around a set of PHI nodes where we
+/// have determined this to be both safe and profitable.
+///
+/// This routine handles any splitting of critical edges necessary to create
+/// a safe block to speculate into as well as cloning the instructions and
+/// rewriting all uses.
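In its simplest shape (all names here are illustrative, not taken from a real test), speculating %v = add i32 %p, %x around %p = phi i32 [ 7, %a ], [ 11, %b ] clones the add into each predecessor as %v.0 = add i32 7, %x in %a and %v.1 = add i32 11, %x in %b, then replaces any remaining uses of %v with %v.phi = phi i32 [ %v.0, %a ], [ %v.1, %b ]. The numeric and ".phi" suffixes match the renaming the routine below performs.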
+static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
+                          SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
+                          SmallSetVector<BasicBlock *, 16> &PredSet,
+                          DominatorTree &DT) {
+  DEBUG(dbgs() << "  Speculating around " << SpecPNs.size() << " PHIs!\n");
+  NumPHIsSpeculated += SpecPNs.size();
+
+  // Split any critical edges so that we have a block to hoist into.
+  auto *ParentBB = SpecPNs[0]->getParent();
+  SmallVector<BasicBlock *, 16> SpecPreds;
+  SpecPreds.reserve(PredSet.size());
+  for (auto *PredBB : PredSet) {
+    auto *NewPredBB = SplitCriticalEdge(
+        PredBB, ParentBB,
+        CriticalEdgeSplittingOptions(&DT).setMergeIdenticalEdges());
+    if (NewPredBB) {
+      ++NumEdgesSplit;
+      DEBUG(dbgs() << "  Split critical edge from: " << PredBB->getName()
+                   << "\n");
+      SpecPreds.push_back(NewPredBB);
+    } else {
+      assert(PredBB->getSingleSuccessor() == ParentBB &&
+             "We need a non-critical predecessor to speculate into.");
+      assert(!isa<InvokeInst>(PredBB->getTerminator()) &&
+             "Cannot have a non-critical invoke!");
+
+      // Already non-critical, use existing pred.
+      SpecPreds.push_back(PredBB);
+    }
+  }
+
+  SmallPtrSet<Instruction *, 16> SpecSet;
+  SmallVector<Instruction *, 16> SpecList;
+  visitPHIUsersAndDepsInPostOrder(SpecPNs,
+                                  /*IsVisited*/
+                                  [&](Instruction *I) {
+                                    // This is visited if we don't need to
+                                    // speculate it or we already have
+                                    // speculated it.
+                                    return !PotentialSpecSet.count(I) ||
+                                           SpecSet.count(I);
+                                  },
+                                  /*Visit*/
+                                  [&](Instruction *I) {
+                                    // All operands scheduled, schedule this
+                                    // node.
+                                    SpecSet.insert(I);
+                                    SpecList.push_back(I);
+                                  });
+
+  int NumSpecInsts = SpecList.size() * SpecPreds.size();
+  int NumRedundantInsts = NumSpecInsts - SpecList.size();
+  DEBUG(dbgs() << "  Inserting " << NumSpecInsts << " speculated instructions, "
+               << NumRedundantInsts << " redundancies\n");
+  NumSpeculatedInstructions += NumSpecInsts;
+  NumNewRedundantInstructions += NumRedundantInsts;
+
+  // Each predecessor is numbered by its index in `SpecPreds`, so for each
+  // instruction we speculate, the speculated instruction is stored in that
+  // index of the vector associated with the original instruction. We also
+  // store the incoming values for each predecessor from any PHIs used.
+  SmallDenseMap<Instruction *, SmallVector<Value *, 2>, 16> SpeculatedValueMap;
+
+  // Inject the synthetic mappings to rewrite PHIs to the appropriate incoming
+  // value. This handles both the PHIs we are speculating around and any other
+  // PHIs that happen to be used.
+  for (auto *OrigI : SpecList)
+    for (auto *OpV : OrigI->operand_values()) {
+      auto *OpPN = dyn_cast<PHINode>(OpV);
+      if (!OpPN || OpPN->getParent() != ParentBB)
+        continue;
+
+      auto InsertResult = SpeculatedValueMap.insert({OpPN, {}});
+      if (!InsertResult.second)
+        continue;
+
+      auto &SpeculatedVals = InsertResult.first->second;
+
+      // Populating our structure for mapping is particularly annoying because
+      // finding an incoming value for a particular predecessor block in a PHI
+      // node is a linear time operation! To avoid quadratic behavior, we build
+      // a map for this PHI node's incoming values and then translate it into
+      // the more compact representation used below.
+      SmallDenseMap<BasicBlock *, Value *, 16> IncomingValueMap;
+      for (int i : llvm::seq<int>(0, OpPN->getNumIncomingValues()))
+        IncomingValueMap[OpPN->getIncomingBlock(i)] = OpPN->getIncomingValue(i);
+
+      for (auto *PredBB : SpecPreds)
+        SpeculatedVals.push_back(IncomingValueMap.find(PredBB)->second);
+    }
+
+  // Speculate into each predecessor.
+ for (int PredIdx : llvm::seq<int>(0, SpecPreds.size())) { + auto *PredBB = SpecPreds[PredIdx]; + assert(PredBB->getSingleSuccessor() == ParentBB && + "We need a non-critical predecessor to speculate into."); + + for (auto *OrigI : SpecList) { + auto *NewI = OrigI->clone(); + NewI->setName(Twine(OrigI->getName()) + "." + Twine(PredIdx)); + NewI->insertBefore(PredBB->getTerminator()); + + // Rewrite all the operands to the previously speculated instructions. + // Because we're walking in-order, the defs must precede the uses and we + // should already have these mappings. + for (Use &U : NewI->operands()) { + auto *OpI = dyn_cast<Instruction>(U.get()); + if (!OpI) + continue; + auto MapIt = SpeculatedValueMap.find(OpI); + if (MapIt == SpeculatedValueMap.end()) + continue; + const auto &SpeculatedVals = MapIt->second; + assert(SpeculatedVals[PredIdx] && + "Must have a speculated value for this predecessor!"); + assert(SpeculatedVals[PredIdx]->getType() == OpI->getType() && + "Speculated value has the wrong type!"); + + // Rewrite the use to this predecessor's speculated instruction. + U.set(SpeculatedVals[PredIdx]); + } + + // Commute instructions which now have a constant in the LHS but not the + // RHS. + if (NewI->isBinaryOp() && NewI->isCommutative() && + isa<Constant>(NewI->getOperand(0)) && + !isa<Constant>(NewI->getOperand(1))) + NewI->getOperandUse(0).swap(NewI->getOperandUse(1)); + + SpeculatedValueMap[OrigI].push_back(NewI); + assert(SpeculatedValueMap[OrigI][PredIdx] == NewI && + "Mismatched speculated instruction index!"); + } + } + + // Walk the speculated instruction list and if they have uses, insert a PHI + // for them from the speculated versions, and replace the uses with the PHI. + // Then erase the instructions as they have been fully speculated. The walk + // needs to be in reverse so that we don't think there are users when we'll + // actually eventually remove them later. + IRBuilder<> IRB(SpecPNs[0]); + for (auto *OrigI : llvm::reverse(SpecList)) { + // Check if we need a PHI for any remaining users and if so, insert it. + if (!OrigI->use_empty()) { + auto *SpecIPN = IRB.CreatePHI(OrigI->getType(), SpecPreds.size(), + Twine(OrigI->getName()) + ".phi"); + // Add the incoming values we speculated. + auto &SpeculatedVals = SpeculatedValueMap.find(OrigI)->second; + for (int PredIdx : llvm::seq<int>(0, SpecPreds.size())) + SpecIPN->addIncoming(SpeculatedVals[PredIdx], SpecPreds[PredIdx]); + + // And replace the uses with the PHI node. + OrigI->replaceAllUsesWith(SpecIPN); + } + + // It is important to immediately erase this so that it stops using other + // instructions. This avoids inserting needless PHIs of them. + OrigI->eraseFromParent(); + } + + // All of the uses of the speculated phi nodes should be removed at this + // point, so erase them. + for (auto *SpecPN : SpecPNs) { + assert(SpecPN->use_empty() && "All users should have been speculated!"); + SpecPN->eraseFromParent(); + } +} + +/// Try to speculate around a series of PHIs from a single basic block. +/// +/// This routine checks whether any of these PHIs are profitable to speculate +/// users around. If safe and profitable, it does the speculation. It returns +/// true when at least some speculation occurs. +static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs, + DominatorTree &DT, TargetTransformInfo &TTI) { + DEBUG(dbgs() << "Evaluating phi nodes for speculation:\n"); + + // Savings in cost from speculating around a PHI node. 
+  SmallDenseMap<PHINode *, int, 16> CostSavingsMap;
+
+  // Remember the set of instructions that are candidates for speculation so
+  // that we can quickly walk things within that space. This prunes out
+  // instructions already available along edges, etc.
+  SmallPtrSet<Instruction *, 16> PotentialSpecSet;
+
+  // Remember the set of instructions that are (transitively) unsafe to
+  // speculate into the incoming edges of this basic block. This avoids
+  // recomputing them for each PHI node we check. This set is specific to this
+  // block though as things are pruned out of it based on what is available
+  // along incoming edges.
+  SmallPtrSet<Instruction *, 16> UnsafeSet;
+
+  // For each PHI node in this block, check whether there are immediate folding
+  // opportunities from speculation, and whether that speculation will be
+  // valid. This determines the set of safe PHIs to speculate.
+  PNs.erase(llvm::remove_if(PNs,
+                            [&](PHINode *PN) {
+                              return !isSafeAndProfitableToSpeculateAroundPHI(
+                                  *PN, CostSavingsMap, PotentialSpecSet,
+                                  UnsafeSet, DT, TTI);
+                            }),
+            PNs.end());
+  // If no PHIs were profitable, skip.
+  if (PNs.empty()) {
+    DEBUG(dbgs() << "  No safe and profitable PHIs found!\n");
+    return false;
+  }
+
+  // We need to know how much speculation will cost, which is determined by how
+  // many incoming edges will need a copy of each speculated instruction.
+  SmallSetVector<BasicBlock *, 16> PredSet;
+  for (auto *PredBB : PNs[0]->blocks()) {
+    if (!PredSet.insert(PredBB))
+      continue;
+
+    // We cannot speculate when a predecessor is an indirect branch.
+    // FIXME: We also can't reliably create a non-critical edge block for
+    // speculation if the predecessor is an invoke. This doesn't seem
+    // fundamental and we should probably be splitting critical edges
+    // differently.
+    if (isa<IndirectBrInst>(PredBB->getTerminator()) ||
+        isa<InvokeInst>(PredBB->getTerminator())) {
+      DEBUG(dbgs() << "  Invalid: predecessor terminator: " << PredBB->getName()
+                   << "\n");
+      return false;
+    }
+  }
+  if (PredSet.size() < 2) {
+    DEBUG(dbgs() << "  Unimportant: phi with only one predecessor\n");
+    return false;
+  }
+
+  SmallVector<PHINode *, 16> SpecPNs = findProfitablePHIs(
+      PNs, CostSavingsMap, PotentialSpecSet, PredSet.size(), DT, TTI);
+  if (SpecPNs.empty())
+    // Nothing to do.
+ return false; + + speculatePHIs(SpecPNs, PotentialSpecSet, PredSet, DT); + return true; +} + +PreservedAnalyses SpeculateAroundPHIsPass::run(Function &F, + FunctionAnalysisManager &AM) { + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + auto &TTI = AM.getResult<TargetIRAnalysis>(F); + + bool Changed = false; + for (auto *BB : ReversePostOrderTraversal<Function *>(&F)) { + SmallVector<PHINode *, 16> PNs; + auto BBI = BB->begin(); + while (auto *PN = dyn_cast<PHINode>(&*BBI)) { + PNs.push_back(PN); + ++BBI; + } + + if (PNs.empty()) + continue; + + Changed |= tryToSpeculatePHIs(PNs, DT, TTI); + } + + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + return PA; +} diff --git a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index 8b8d6590aa6a..ce40af1223f6 100644 --- a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -1,4 +1,4 @@ -//===-- StraightLineStrengthReduce.cpp - ------------------------*- C++ -*-===// +//===- StraightLineStrengthReduce.cpp - -----------------------------------===// // // The LLVM Compiler Infrastructure // @@ -55,26 +55,45 @@ // // - When (i' - i) is constant but i and i' are not, we could still perform // SLSR. + +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" +#include <cassert> +#include <cstdint> +#include <limits> #include <list> #include <vector> using namespace llvm; using namespace PatternMatch; -namespace { +static const unsigned UnknownAddressSpace = + std::numeric_limits<unsigned>::max(); -static const unsigned UnknownAddressSpace = ~0u; +namespace { class StraightLineStrengthReduce : public FunctionPass { public: @@ -88,20 +107,22 @@ public: GEP, // &B[..][i * S][..] }; - Candidate() - : CandidateKind(Invalid), Base(nullptr), Index(nullptr), - Stride(nullptr), Ins(nullptr), Basis(nullptr) {} + Candidate() = default; Candidate(Kind CT, const SCEV *B, ConstantInt *Idx, Value *S, Instruction *I) - : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I), - Basis(nullptr) {} - Kind CandidateKind; - const SCEV *Base; + : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I) {} + + Kind CandidateKind = Invalid; + + const SCEV *Base = nullptr; + // Note that Index and Stride of a GEP candidate do not necessarily have the // same integer type. In that case, during rewriting, Stride will be // sign-extended or truncated to Index's type. - ConstantInt *Index; - Value *Stride; + ConstantInt *Index = nullptr; + + Value *Stride = nullptr; + // The instruction this candidate corresponds to. 
It helps us to rewrite a
   // candidate with respect to its immediate basis. Note that one instruction
   // can correspond to multiple candidates depending on how you associate the
@@ -116,16 +137,16 @@ public:
   // or
   //
   // <Base: b, Index: 2, Stride: a + 1>
-  Instruction *Ins;
+  Instruction *Ins = nullptr;
+
   // Points to the immediate basis of this candidate, or nullptr if we cannot
   // find any basis for this candidate.
-  Candidate *Basis;
+  Candidate *Basis = nullptr;
 };

 static char ID;

-  StraightLineStrengthReduce()
-      : FunctionPass(ID), DL(nullptr), DT(nullptr), TTI(nullptr) {
+  StraightLineStrengthReduce() : FunctionPass(ID) {
     initializeStraightLineStrengthReducePass(*PassRegistry::getPassRegistry());
   }

@@ -148,46 +169,58 @@ private:
   // Returns true if Basis is a basis for C, i.e., Basis dominates C and they
   // share the same base and stride.
   bool isBasisFor(const Candidate &Basis, const Candidate &C);
+
   // Returns whether the candidate can be folded into an addressing mode.
   bool isFoldable(const Candidate &C, TargetTransformInfo *TTI,
                   const DataLayout *DL);
+
   // Returns true if C is already in a simplest form and not worth being
   // rewritten.
   bool isSimplestForm(const Candidate &C);
+
   // Checks whether I is in a candidate form. If so, adds all the matching forms
   // to Candidates, and tries to find the immediate basis for each of them.
   void allocateCandidatesAndFindBasis(Instruction *I);
+
   // Allocate candidates and find bases for Add instructions.
   void allocateCandidatesAndFindBasisForAdd(Instruction *I);
+
   // Given I = LHS + RHS, factors RHS into i * S and makes (LHS + i * S) a
   // candidate.
   void allocateCandidatesAndFindBasisForAdd(Value *LHS, Value *RHS,
                                             Instruction *I);
   // Allocate candidates and find bases for Mul instructions.
   void allocateCandidatesAndFindBasisForMul(Instruction *I);
+
   // Splits LHS into Base + Index and, if it succeeds, calls
   // allocateCandidatesAndFindBasis.
   void allocateCandidatesAndFindBasisForMul(Value *LHS, Value *RHS,
                                             Instruction *I);
+
   // Allocate candidates and find bases for GetElementPtr instructions.
   void allocateCandidatesAndFindBasisForGEP(GetElementPtrInst *GEP);
+
   // A helper function that scales Idx with ElementSize before invoking
   // allocateCandidatesAndFindBasis.
   void allocateCandidatesAndFindBasisForGEP(const SCEV *B, ConstantInt *Idx,
                                             Value *S, uint64_t ElementSize,
                                             Instruction *I);
+
   // Adds the given form <CT, B, Idx, S> to Candidates, and finds its immediate
   // basis.
   void allocateCandidatesAndFindBasis(Candidate::Kind CT, const SCEV *B,
                                       ConstantInt *Idx, Value *S,
                                       Instruction *I);
+
   // Rewrites candidate C with respect to Basis.
   void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis);
+
   // A helper function that factors ArrayIdx to a product of a stride and a
   // constant index, and invokes allocateCandidatesAndFindBasis with the
   // factorings.
   void factorArrayIndex(Value *ArrayIdx, const SCEV *Base, uint64_t ElementSize,
                         GetElementPtrInst *GEP);
+
   // Emit code that computes the "bump" from Basis to C.
If the candidate is a // GEP and the bump is not divisible by the element size of the GEP, this // function sets the BumpWithUglyGEP flag to notify its caller to bump the @@ -196,19 +229,22 @@ private: IRBuilder<> &Builder, const DataLayout *DL, bool &BumpWithUglyGEP); - const DataLayout *DL; - DominatorTree *DT; + const DataLayout *DL = nullptr; + DominatorTree *DT = nullptr; ScalarEvolution *SE; - TargetTransformInfo *TTI; + TargetTransformInfo *TTI = nullptr; std::list<Candidate> Candidates; + // Temporarily holds all instructions that are unlinked (but not deleted) by // rewriteCandidateWithBasis. These instructions will be actually removed // after all rewriting finishes. std::vector<Instruction *> UnlinkedInstructions; }; -} // anonymous namespace + +} // end anonymous namespace char StraightLineStrengthReduce::ID = 0; + INITIALIZE_PASS_BEGIN(StraightLineStrengthReduce, "slsr", "Straight line strength reduction", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) @@ -650,8 +686,8 @@ void StraightLineStrengthReduce::rewriteCandidateWithBasis( else Reduced = Builder.CreateGEP(nullptr, Basis.Ins, Bump); } + break; } - break; default: llvm_unreachable("C.CandidateKind is invalid"); }; diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp index 0cccb415efdb..2972e1cff9a4 100644 --- a/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -1,4 +1,4 @@ -//===-- StructurizeCFG.cpp ------------------------------------------------===// +//===- StructurizeCFG.cpp -------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -7,49 +7,72 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/RegionIterator.h" #include "llvm/Analysis/RegionPass.h" -#include "llvm/IR/Module.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include <algorithm> +#include <cassert> +#include <utility> using namespace llvm; using namespace llvm::PatternMatch; #define DEBUG_TYPE "structurizecfg" +// The name for newly created blocks. +static const char *const FlowBlockName = "Flow"; + namespace { // Definition of the complex types used in this pass. 
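The typedef-to-using conversions just below are mechanical: the alias declaration reads left to right and, unlike typedef, extends to templates. A small sketch of the difference:

    #include "llvm/ADT/SmallVector.h"

    // Equivalent forms for a plain alias:
    typedef llvm::SmallVector<int, 8> IntVecT;
    using IntVec = llvm::SmallVector<int, 8>;

    // Alias templates have no typedef equivalent:
    template <typename T> using Vec8 = llvm::SmallVector<T, 8>;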
-typedef std::pair<BasicBlock *, Value *> BBValuePair; +using BBValuePair = std::pair<BasicBlock *, Value *>; -typedef SmallVector<RegionNode*, 8> RNVector; -typedef SmallVector<BasicBlock*, 8> BBVector; -typedef SmallVector<BranchInst*, 8> BranchVector; -typedef SmallVector<BBValuePair, 2> BBValueVector; +using RNVector = SmallVector<RegionNode *, 8>; +using BBVector = SmallVector<BasicBlock *, 8>; +using BranchVector = SmallVector<BranchInst *, 8>; +using BBValueVector = SmallVector<BBValuePair, 2>; -typedef SmallPtrSet<BasicBlock *, 8> BBSet; +using BBSet = SmallPtrSet<BasicBlock *, 8>; -typedef MapVector<PHINode *, BBValueVector> PhiMap; -typedef MapVector<BasicBlock *, BBVector> BB2BBVecMap; +using PhiMap = MapVector<PHINode *, BBValueVector>; +using BB2BBVecMap = MapVector<BasicBlock *, BBVector>; -typedef DenseMap<BasicBlock *, PhiMap> BBPhiMap; -typedef DenseMap<BasicBlock *, Value *> BBPredicates; -typedef DenseMap<BasicBlock *, BBPredicates> PredMap; -typedef DenseMap<BasicBlock *, BasicBlock*> BB2BBMap; - -// The name for newly created blocks. -static const char *const FlowBlockName = "Flow"; +using BBPhiMap = DenseMap<BasicBlock *, PhiMap>; +using BBPredicates = DenseMap<BasicBlock *, Value *>; +using PredMap = DenseMap<BasicBlock *, BBPredicates>; +using BB2BBMap = DenseMap<BasicBlock *, BasicBlock *>; /// Finds the nearest common dominator of a set of BasicBlocks. /// @@ -736,7 +759,6 @@ void StructurizeCFG::wireFlow(bool ExitUseAllowed, changeExit(PrevNode, Node->getEntry(), true); } PrevNode = Node; - } else { // Insert extra prefix node (or reuse last one) BasicBlock *Flow = needPrefix(false); diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index 90c5c243f464..2a1106b41de2 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -60,6 +60,7 @@ #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CFG.h" #include "llvm/IR/CallSite.h" @@ -78,7 +79,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" using namespace llvm; #define DEBUG_TYPE "tailcallelim" @@ -177,7 +177,8 @@ struct AllocaDerivedValueTracker { }; } -static bool markTails(Function &F, bool &AllCallsAreTailCalls) { +static bool markTails(Function &F, bool &AllCallsAreTailCalls, + OptimizationRemarkEmitter *ORE) { if (F.callsFunctionThatReturnsTwice()) return false; AllCallsAreTailCalls = true; @@ -228,7 +229,7 @@ static bool markTails(Function &F, bool &AllCallsAreTailCalls) { Escaped = ESCAPED; CallInst *CI = dyn_cast<CallInst>(&I); - if (!CI || CI->isTailCall()) + if (!CI || CI->isTailCall() || isa<DbgInfoIntrinsic>(&I)) continue; bool IsNoTail = CI->isNoTailCall() || CI->hasOperandBundles(); @@ -252,9 +253,11 @@ static bool markTails(Function &F, bool &AllCallsAreTailCalls) { break; } if (SafeToTail) { - emitOptimizationRemark( - F.getContext(), "tailcallelim", F, CI->getDebugLoc(), - "marked this readnone call a tail call candidate"); + using namespace ore; + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "tailcall-readnone", CI) + << "marked as tail call candidate (readnone)"; + }); CI->setTailCall(); Modified = true; continue; @@ -299,9 +302,7 @@ static bool 
markTails(Function &F, bool &AllCallsAreTailCalls) { if (Visited[CI->getParent()] != ESCAPED) { // If the escape point was part way through the block, calls after the // escape point wouldn't have been put into DeferredTails. - emitOptimizationRemark(F.getContext(), "tailcallelim", F, - CI->getDebugLoc(), - "marked this call a tail call candidate"); + DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n"); CI->setTailCall(); Modified = true; } else { @@ -330,7 +331,7 @@ static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) { // Writes to memory only matter if they may alias the pointer // being loaded from. const DataLayout &DL = L->getModule()->getDataLayout(); - if ((AA->getModRefInfo(CI, MemoryLocation::get(L)) & MRI_Mod) || + if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))) || !isSafeToLoadUnconditionally(L->getPointerOperand(), L->getAlignment(), DL, L)) return false; @@ -491,7 +492,8 @@ static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, SmallVectorImpl<PHINode *> &ArgumentPHIs, - AliasAnalysis *AA) { + AliasAnalysis *AA, + OptimizationRemarkEmitter *ORE) { // If we are introducing accumulator recursion to eliminate operations after // the call instruction that are both associative and commutative, the initial // value for the accumulator is placed in this variable. If this value is set @@ -551,8 +553,11 @@ static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, BasicBlock *BB = Ret->getParent(); Function *F = BB->getParent(); - emitOptimizationRemark(F->getContext(), "tailcallelim", *F, CI->getDebugLoc(), - "transforming tail recursion to loop"); + using namespace ore; + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "tailcall-recursion", CI) + << "transforming tail recursion into loop"; + }); // OK! We can transform this tail call. If this is the first one found, // create the new entry block, allowing us to branch back to the old entry. @@ -666,13 +671,11 @@ static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, return true; } -static bool foldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret, - BasicBlock *&OldEntry, - bool &TailCallsAreMarkedTail, - SmallVectorImpl<PHINode *> &ArgumentPHIs, - bool CannotTailCallElimCallsMarkedTail, - const TargetTransformInfo *TTI, - AliasAnalysis *AA) { +static bool foldReturnAndProcessPred( + BasicBlock *BB, ReturnInst *Ret, BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, SmallVectorImpl<PHINode *> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail, const TargetTransformInfo *TTI, + AliasAnalysis *AA, OptimizationRemarkEmitter *ORE) { bool Change = false; // Make sure this block is a trivial return block. 
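For intuition about the accumulator variable described earlier in this hunk, here is a schematic of the transformation in plain C++, not the pass's actual output. Eliminating the multiply that executes after the recursive call in

    int fac(int n) { return n <= 1 ? 1 : n * fac(n - 1); }

amounts to carrying the pending multiplies in an accumulator that starts at the operation's identity value:

    int fac(int n) {
      int Acc = 1;    // identity of the associative, commutative multiply
      for (; n > 1; --n)
        Acc *= n;     // fold the formerly-pending multiply into Acc
      return Acc;
    }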
@@ -708,7 +711,7 @@ static bool foldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret,
       BB->eraseFromParent();

       eliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail,
-                                 ArgumentPHIs, AA);
+                                 ArgumentPHIs, AA, ORE);
       ++NumRetDuped;
       Change = true;
     }
@@ -722,23 +725,25 @@ static bool processReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
                                   SmallVectorImpl<PHINode *> &ArgumentPHIs,
                                   bool CannotTailCallElimCallsMarkedTail,
                                   const TargetTransformInfo *TTI,
-                                  AliasAnalysis *AA) {
+                                  AliasAnalysis *AA,
+                                  OptimizationRemarkEmitter *ORE) {
   CallInst *CI = findTRECandidate(Ret, CannotTailCallElimCallsMarkedTail, TTI);
   if (!CI)
     return false;

   return eliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail,
-                                    ArgumentPHIs, AA);
+                                    ArgumentPHIs, AA, ORE);
 }

 static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI,
-                                   AliasAnalysis *AA) {
+                                   AliasAnalysis *AA,
+                                   OptimizationRemarkEmitter *ORE) {
   if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
     return false;

   bool MadeChange = false;
   bool AllCallsAreTailCalls = false;
-  MadeChange |= markTails(F, AllCallsAreTailCalls);
+  MadeChange |= markTails(F, AllCallsAreTailCalls, ORE);
   if (!AllCallsAreTailCalls)
     return MadeChange;
@@ -765,13 +770,13 @@ static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI,
   for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; /*in loop*/) {
     BasicBlock *BB = &*BBI++; // foldReturnAndProcessPred may delete BB.
     if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
-      bool Change =
-          processReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
-                                ArgumentPHIs, !CanTRETailMarkedCall, TTI, AA);
+      bool Change = processReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
+                                          ArgumentPHIs, !CanTRETailMarkedCall,
+                                          TTI, AA, ORE);
       if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
         Change = foldReturnAndProcessPred(BB, Ret, OldEntry,
                                           TailCallsAreMarkedTail, ArgumentPHIs,
-                                          !CanTRETailMarkedCall, TTI, AA);
+                                          !CanTRETailMarkedCall, TTI, AA, ORE);
       MadeChange |= Change;
     }
   }
@@ -802,6 +807,7 @@ struct TailCallElim : public FunctionPass {
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<TargetTransformInfoWrapperPass>();
     AU.addRequired<AAResultsWrapperPass>();
+    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
   }

@@ -811,7 +817,8 @@ struct TailCallElim : public FunctionPass {

     return eliminateTailRecursion(
         F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
-        &getAnalysis<AAResultsWrapperPass>().getAAResults());
+        &getAnalysis<AAResultsWrapperPass>().getAAResults(),
+        &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE());
   }
 };
 }
@@ -820,6 +827,7 @@ char TailCallElim::ID = 0;
 INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", "Tail Call Elimination",
                       false, false)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
 INITIALIZE_PASS_END(TailCallElim, "tailcallelim", "Tail Call Elimination",
                     false, false)
@@ -833,8 +841,9 @@ PreservedAnalyses TailCallElimPass::run(Function &F,

   TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
   AliasAnalysis &AA = AM.getResult<AAManager>(F);
+  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

-  bool Changed = eliminateTailRecursion(F, &TTI, &AA);
+  bool Changed = eliminateTailRecursion(F, &TTI, &AA, &ORE);
   if (!Changed)
     return PreservedAnalyses::all();
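A minimal sketch of the remark-emission pattern this change adopts (the helper name is illustrative): emit() takes a callback, so the remark object is only constructed when remarks are actually enabled for the pass.

    #include "llvm/Analysis/OptimizationRemarkEmitter.h"
    #include "llvm/IR/DiagnosticInfo.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    static void noteTailCall(OptimizationRemarkEmitter &ORE, CallInst *CI) {
      ORE.emit([&]() {
        return OptimizationRemark("tailcallelim", "tailcall-recursion", CI)
               << "transforming tail recursion into loop";
      });
    }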
