diff options
Diffstat (limited to 'lib/Transforms/Utils')
55 files changed, 4653 insertions, 1569 deletions
| diff --git a/lib/Transforms/Utils/AddDiscriminators.cpp b/lib/Transforms/Utils/AddDiscriminators.cpp index 0f0668f24db5..e3ef42362223 100644 --- a/lib/Transforms/Utils/AddDiscriminators.cpp +++ b/lib/Transforms/Utils/AddDiscriminators.cpp @@ -69,7 +69,7 @@  #include "llvm/Support/CommandLine.h"  #include "llvm/Support/Debug.h"  #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h"  #include <utility>  using namespace llvm; @@ -114,7 +114,7 @@ static bool shouldHaveDiscriminator(const Instruction *I) {    return !isa<IntrinsicInst>(I) || isa<MemIntrinsic>(I);  } -/// \brief Assign DWARF discriminators. +/// Assign DWARF discriminators.  ///  /// To assign discriminators, we examine the boundaries of every  /// basic block and its successors. Suppose there is a basic block B1 @@ -210,9 +210,9 @@ static bool addDiscriminators(Function &F) {        // it in 1 byte ULEB128 representation.        unsigned Discriminator = R.second ? ++LDM[L] : LDM[L];        I.setDebugLoc(DIL->setBaseDiscriminator(Discriminator)); -      DEBUG(dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":" -                   << DIL->getColumn() << ":" << Discriminator << " " << I -                   << "\n"); +      LLVM_DEBUG(dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":" +                        << DIL->getColumn() << ":" << Discriminator << " " << I +                        << "\n");        Changed = true;      }    } diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp index 606bd8baccaa..516a785dce1e 100644 --- a/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -20,6 +20,7 @@  #include "llvm/Analysis/CFG.h"  #include "llvm/Analysis/LoopInfo.h"  #include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Transforms/Utils/Local.h"  #include "llvm/IR/BasicBlock.h"  #include "llvm/IR/CFG.h"  #include "llvm/IR/Constants.h" @@ 
-36,7 +37,6 @@  #include "llvm/IR/Value.h"  #include "llvm/IR/ValueHandle.h"  #include "llvm/Support/Casting.h" -#include "llvm/Transforms/Utils/Local.h"  #include <cassert>  #include <cstdint>  #include <string> @@ -45,16 +45,22 @@  using namespace llvm; -void llvm::DeleteDeadBlock(BasicBlock *BB) { +void llvm::DeleteDeadBlock(BasicBlock *BB, DeferredDominance *DDT) {    assert((pred_begin(BB) == pred_end(BB) ||           // Can delete self loop.           BB->getSinglePredecessor() == BB) && "Block is not dead!");    TerminatorInst *BBTerm = BB->getTerminator(); +  std::vector<DominatorTree::UpdateType> Updates;    // Loop through all of our successors and make sure they know that one    // of their predecessors is going away. -  for (BasicBlock *Succ : BBTerm->successors()) +  if (DDT) +    Updates.reserve(BBTerm->getNumSuccessors()); +  for (BasicBlock *Succ : BBTerm->successors()) {      Succ->removePredecessor(BB); +    if (DDT) +      Updates.push_back({DominatorTree::Delete, BB, Succ}); +  }    // Zap all the instructions in the block.    while (!BB->empty()) { @@ -69,8 +75,12 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) {      BB->getInstList().pop_back();    } -  // Zap the block! -  BB->eraseFromParent(); +  if (DDT) { +    DDT->applyUpdates(Updates); +    DDT->deleteBB(BB); // Deferred deletion of BB. +  } else { +    BB->eraseFromParent(); // Zap the block! +  }  }  void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, @@ -94,9 +104,8 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) {    // Recursively deleting a PHI may cause multiple PHIs to be deleted    // or RAUW'd undef, so use an array of WeakTrackingVH for the PHIs to delete.    
SmallVector<WeakTrackingVH, 8> PHIs; -  for (BasicBlock::iterator I = BB->begin(); -       PHINode *PN = dyn_cast<PHINode>(I); ++I) -    PHIs.push_back(PN); +  for (PHINode &PN : BB->phis()) +    PHIs.push_back(&PN);    bool Changed = false;    for (unsigned i = 0, e = PHIs.size(); i != e; ++i) @@ -108,9 +117,12 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) {  bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT,                                       LoopInfo *LI, -                                     MemoryDependenceResults *MemDep) { -  // Don't merge away blocks who have their address taken. -  if (BB->hasAddressTaken()) return false; +                                     MemoryDependenceResults *MemDep, +                                     DeferredDominance *DDT) { +  assert(!(DT && DDT) && "Cannot call with both DT and DDT."); + +  if (BB->hasAddressTaken()) +    return false;    // Can't merge if there are multiple predecessors, or no predecessors.    BasicBlock *PredBB = BB->getUniquePredecessor(); @@ -122,39 +134,38 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT,    if (PredBB->getTerminator()->isExceptional())      return false; -  succ_iterator SI(succ_begin(PredBB)), SE(succ_end(PredBB)); -  BasicBlock *OnlySucc = BB; -  for (; SI != SE; ++SI) -    if (*SI != OnlySucc) { -      OnlySucc = nullptr;     // There are multiple distinct successors! -      break; -    } - -  // Can't merge if there are multiple successors. -  if (!OnlySucc) return false; +  // Can't merge if there are multiple distinct successors. +  if (PredBB->getUniqueSuccessor() != BB) +    return false;    // Can't merge if there is PHI loop. 
-  for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) { -    if (PHINode *PN = dyn_cast<PHINode>(BI)) { -      for (Value *IncValue : PN->incoming_values()) -        if (IncValue == PN) -          return false; -    } else -      break; -  } +  for (PHINode &PN : BB->phis()) +    for (Value *IncValue : PN.incoming_values()) +      if (IncValue == &PN) +        return false;    // Begin by getting rid of unneeded PHIs. -  SmallVector<Value *, 4> IncomingValues; +  SmallVector<AssertingVH<Value>, 4> IncomingValues;    if (isa<PHINode>(BB->front())) { -    for (auto &I : *BB) -      if (PHINode *PN = dyn_cast<PHINode>(&I)) { -        if (PN->getIncomingValue(0) != PN) -          IncomingValues.push_back(PN->getIncomingValue(0)); -      } else -        break; +    for (PHINode &PN : BB->phis()) +      if (!isa<PHINode>(PN.getIncomingValue(0)) || +          cast<PHINode>(PN.getIncomingValue(0))->getParent() != BB) +        IncomingValues.push_back(PN.getIncomingValue(0));      FoldSingleEntryPHINodes(BB, MemDep);    } +  // Deferred DT update: Collect all the edges that exit BB. These +  // dominator edges will be redirected from Pred. +  std::vector<DominatorTree::UpdateType> Updates; +  if (DDT) { +    Updates.reserve(1 + (2 * succ_size(BB))); +    Updates.push_back({DominatorTree::Delete, PredBB, BB}); +    for (auto I = succ_begin(BB), E = succ_end(BB); I != E; ++I) { +      Updates.push_back({DominatorTree::Delete, BB, *I}); +      Updates.push_back({DominatorTree::Insert, PredBB, *I}); +    } +  } +    // Delete the unconditional branch from the predecessor...    PredBB->getInstList().pop_back(); @@ -166,8 +177,8 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT,    PredBB->getInstList().splice(PredBB->end(), BB->getInstList());    // Eliminate duplicate dbg.values describing the entry PHI node post-splice. 
-  for (auto *Incoming : IncomingValues) { -    if (isa<Instruction>(Incoming)) { +  for (auto Incoming : IncomingValues) { +    if (isa<Instruction>(*Incoming)) {        SmallVector<DbgValueInst *, 2> DbgValues;        SmallDenseSet<std::pair<DILocalVariable *, DIExpression *>, 2>            DbgValueSet; @@ -201,7 +212,12 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT,    if (MemDep)      MemDep->invalidateCachedPredecessors(); -  BB->eraseFromParent(); +  if (DDT) { +    DDT->deleteBB(BB); // Deferred deletion of BB. +    DDT->applyUpdates(Updates); +  } else { +    BB->eraseFromParent(); // Nuke BB. +  }    return true;  } @@ -317,13 +333,21 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,                                        DominatorTree *DT, LoopInfo *LI,                                        bool PreserveLCSSA, bool &HasLoopExit) {    // Update dominator tree if available. -  if (DT) -    DT->splitBlock(NewBB); +  if (DT) { +    if (OldBB == DT->getRootNode()->getBlock()) { +      assert(NewBB == &NewBB->getParent()->getEntryBlock()); +      DT->setNewRoot(NewBB); +    } else { +      // Split block expects NewBB to have a non-empty set of predecessors. +      DT->splitBlock(NewBB); +    } +  }    // The rest of the logic is only relevant for updating the loop structures.    if (!LI)      return; +  assert(DT && "DT should be available to update LoopInfo!");    Loop *L = LI->getLoopFor(OldBB);    // If we need to preserve loop analyses, collect some information about how @@ -331,6 +355,12 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,    bool IsLoopEntry = !!L;    bool SplitMakesNewLoopHeader = false;    for (BasicBlock *Pred : Preds) { +    // Preds that are not reachable from entry should not be used to identify if +    // OldBB is a loop entry or if SplitMakesNewLoopHeader. 
Unreachable blocks +    // are not within any loops, so we incorrectly mark SplitMakesNewLoopHeader +    // as true and make the NewBB the header of some loop. This breaks LI. +    if (!DT->isReachableFromEntry(Pred)) +      continue;      // If we need to preserve LCSSA, determine if any of the preds is a loop      // exit.      if (PreserveLCSSA) @@ -495,7 +525,6 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,      // Insert dummy values as the incoming value.      for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++I)        cast<PHINode>(I)->addIncoming(UndefValue::get(I->getType()), NewBB); -    return NewBB;    }    // Update DominatorTree, LoopInfo, and LCCSA analysis information. @@ -503,8 +532,11 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,    UpdateAnalysisInformation(BB, NewBB, Preds, DT, LI, PreserveLCSSA,                              HasLoopExit); -  // Update the PHI nodes in BB with the values coming from NewBB. -  UpdatePHINodes(BB, NewBB, Preds, BI, HasLoopExit); +  if (!Preds.empty()) { +    // Update the PHI nodes in BB with the values coming from NewBB. +    UpdatePHINodes(BB, NewBB, Preds, BI, HasLoopExit); +  } +    return NewBB;  } diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp index 3653c307619b..3e30c27a9f33 100644 --- a/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -28,7 +28,7 @@  #include "llvm/IR/Instructions.h"  #include "llvm/IR/Type.h"  #include "llvm/Support/ErrorHandling.h" -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h"  #include "llvm/Transforms/Utils/BasicBlockUtils.h"  #include "llvm/Transforms/Utils/Cloning.h"  #include "llvm/Transforms/Utils/ValueMapper.h" @@ -106,10 +106,9 @@ static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds,            SplitBB->isLandingPad()) && "SplitBB has non-PHI nodes!");    // For each PHI in the destination block. 
-  for (BasicBlock::iterator I = DestBB->begin(); -       PHINode *PN = dyn_cast<PHINode>(I); ++I) { -    unsigned Idx = PN->getBasicBlockIndex(SplitBB); -    Value *V = PN->getIncomingValue(Idx); +  for (PHINode &PN : DestBB->phis()) { +    unsigned Idx = PN.getBasicBlockIndex(SplitBB); +    Value *V = PN.getIncomingValue(Idx);      // If the input is a PHI which already satisfies LCSSA, don't create      // a new one. @@ -119,13 +118,13 @@ static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds,      // Otherwise a new PHI is needed. Create one and populate it.      PHINode *NewPN = PHINode::Create( -        PN->getType(), Preds.size(), "split", +        PN.getType(), Preds.size(), "split",          SplitBB->isLandingPad() ? &SplitBB->front() : SplitBB->getTerminator());      for (unsigned i = 0, e = Preds.size(); i != e; ++i)        NewPN->addIncoming(V, Preds[i]);      // Update the original PHI. -    PN->setIncomingValue(Idx, NewPN); +    PN.setIncomingValue(Idx, NewPN);    }  } diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp index b60dfb4f3541..5f5c4150d3bb 100644 --- a/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/lib/Transforms/Utils/BuildLibCalls.cpp @@ -105,12 +105,23 @@ static bool setRetNonNull(Function &F) {    return true;  } +static bool setNonLazyBind(Function &F) { +  if (F.hasFnAttribute(Attribute::NonLazyBind)) +    return false; +  F.addFnAttr(Attribute::NonLazyBind); +  return true; +} +  bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {    LibFunc TheLibFunc;    if (!(TLI.getLibFunc(F, TheLibFunc) && TLI.has(TheLibFunc)))      return false;    bool Changed = false; + +  if (F.getParent() != nullptr && F.getParent()->getRtLibUseGOT()) +    Changed |= setNonLazyBind(F); +    switch (TheLibFunc) {    case LibFunc_strlen:    case LibFunc_wcslen: @@ -375,6 +386,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {    case 
LibFunc_fseek:    case LibFunc_ftell:    case LibFunc_fgetc: +  case LibFunc_fgetc_unlocked:    case LibFunc_fseeko:    case LibFunc_ftello:    case LibFunc_fileno: @@ -393,6 +405,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {      Changed |= setOnlyReadsMemory(F);      return Changed;    case LibFunc_fputc: +  case LibFunc_fputc_unlocked:    case LibFunc_fstat:    case LibFunc_frexp:    case LibFunc_frexpf: @@ -402,21 +415,25 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {      Changed |= setDoesNotCapture(F, 1);      return Changed;    case LibFunc_fgets: +  case LibFunc_fgets_unlocked:      Changed |= setDoesNotThrow(F);      Changed |= setDoesNotCapture(F, 2);      return Changed;    case LibFunc_fread: +  case LibFunc_fread_unlocked:      Changed |= setDoesNotThrow(F);      Changed |= setDoesNotCapture(F, 0);      Changed |= setDoesNotCapture(F, 3);      return Changed;    case LibFunc_fwrite: +  case LibFunc_fwrite_unlocked:      Changed |= setDoesNotThrow(F);      Changed |= setDoesNotCapture(F, 0);      Changed |= setDoesNotCapture(F, 3);      // FIXME: readonly #1?      
return Changed;    case LibFunc_fputs: +  case LibFunc_fputs_unlocked:      Changed |= setDoesNotThrow(F);      Changed |= setDoesNotCapture(F, 0);      Changed |= setDoesNotCapture(F, 1); @@ -447,6 +464,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {      return Changed;    case LibFunc_gets:    case LibFunc_getchar: +  case LibFunc_getchar_unlocked:      Changed |= setDoesNotThrow(F);      return Changed;    case LibFunc_getitimer: @@ -485,6 +503,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {      Changed |= setOnlyReadsMemory(F, 1);      return Changed;    case LibFunc_putc: +  case LibFunc_putc_unlocked:      Changed |= setDoesNotThrow(F);      Changed |= setDoesNotCapture(F, 1);      return Changed; @@ -505,6 +524,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {      Changed |= setOnlyReadsMemory(F, 1);      return Changed;    case LibFunc_putchar: +  case LibFunc_putchar_unlocked:      Changed |= setDoesNotThrow(F);      return Changed;    case LibFunc_popen: @@ -687,9 +707,9 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {      Changed |= setRetNonNull(F);      Changed |= setRetDoesNotAlias(F);      return Changed; -  //TODO: add LibFunc entries for: -  //case LibFunc_memset_pattern4: -  //case LibFunc_memset_pattern8: +  // TODO: add LibFunc entries for: +  // case LibFunc_memset_pattern4: +  // case LibFunc_memset_pattern8:    case LibFunc_memset_pattern16:      Changed |= setOnlyAccessesArgMemory(F);      Changed |= setDoesNotCapture(F, 0); @@ -709,6 +729,19 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {    }  } +bool llvm::hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty, +                           LibFunc DoubleFn, LibFunc FloatFn, +                           LibFunc LongDoubleFn) { +  switch (Ty->getTypeID()) { +  case Type::FloatTyID: +    return TLI->has(FloatFn); +  
case Type::DoubleTyID: +    return TLI->has(DoubleFn); +  default: +    return TLI->has(LongDoubleFn); +  } +} +  //- Emit LibCalls ------------------------------------------------------------//  Value *llvm::castToCStr(Value *V, IRBuilder<> &B) { @@ -973,6 +1006,24 @@ Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilder<> &B,    return CI;  } +Value *llvm::emitFPutCUnlocked(Value *Char, Value *File, IRBuilder<> &B, +                               const TargetLibraryInfo *TLI) { +  if (!TLI->has(LibFunc_fputc_unlocked)) +    return nullptr; + +  Module *M = B.GetInsertBlock()->getModule(); +  Constant *F = M->getOrInsertFunction("fputc_unlocked", B.getInt32Ty(), +                                       B.getInt32Ty(), File->getType()); +  if (File->getType()->isPointerTy()) +    inferLibFuncAttributes(*M->getFunction("fputc_unlocked"), *TLI); +  Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/ true, "chari"); +  CallInst *CI = B.CreateCall(F, {Char, File}, "fputc_unlocked"); + +  if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts())) +    CI->setCallingConv(Fn->getCallingConv()); +  return CI; +} +  Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilder<> &B,                         const TargetLibraryInfo *TLI) {    if (!TLI->has(LibFunc_fputs)) @@ -991,6 +1042,24 @@ Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilder<> &B,    return CI;  } +Value *llvm::emitFPutSUnlocked(Value *Str, Value *File, IRBuilder<> &B, +                               const TargetLibraryInfo *TLI) { +  if (!TLI->has(LibFunc_fputs_unlocked)) +    return nullptr; + +  Module *M = B.GetInsertBlock()->getModule(); +  StringRef FPutsUnlockedName = TLI->getName(LibFunc_fputs_unlocked); +  Constant *F = M->getOrInsertFunction(FPutsUnlockedName, B.getInt32Ty(), +                                       B.getInt8PtrTy(), File->getType()); +  if (File->getType()->isPointerTy()) +    inferLibFuncAttributes(*M->getFunction(FPutsUnlockedName), *TLI); +  CallInst *CI 
= B.CreateCall(F, {castToCStr(Str, B), File}, "fputs_unlocked"); + +  if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts())) +    CI->setCallingConv(Fn->getCallingConv()); +  return CI; +} +  Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B,                          const DataLayout &DL, const TargetLibraryInfo *TLI) {    if (!TLI->has(LibFunc_fwrite)) @@ -1013,3 +1082,119 @@ Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B,      CI->setCallingConv(Fn->getCallingConv());    return CI;  } + +Value *llvm::emitMalloc(Value *Num, IRBuilder<> &B, const DataLayout &DL, +                        const TargetLibraryInfo *TLI) { +  if (!TLI->has(LibFunc_malloc)) +    return nullptr; + +  Module *M = B.GetInsertBlock()->getModule(); +  LLVMContext &Context = B.GetInsertBlock()->getContext(); +  Value *Malloc = M->getOrInsertFunction("malloc", B.getInt8PtrTy(), +                                         DL.getIntPtrType(Context)); +  inferLibFuncAttributes(*M->getFunction("malloc"), *TLI); +  CallInst *CI = B.CreateCall(Malloc, Num, "malloc"); + +  if (const Function *F = dyn_cast<Function>(Malloc->stripPointerCasts())) +    CI->setCallingConv(F->getCallingConv()); + +  return CI; +} + +Value *llvm::emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs, +                        IRBuilder<> &B, const TargetLibraryInfo &TLI) { +  if (!TLI.has(LibFunc_calloc)) +    return nullptr; + +  Module *M = B.GetInsertBlock()->getModule(); +  const DataLayout &DL = M->getDataLayout(); +  IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext())); +  Value *Calloc = M->getOrInsertFunction("calloc", Attrs, B.getInt8PtrTy(), +                                         PtrType, PtrType); +  inferLibFuncAttributes(*M->getFunction("calloc"), TLI); +  CallInst *CI = B.CreateCall(Calloc, {Num, Size}, "calloc"); + +  if (const auto *F = dyn_cast<Function>(Calloc->stripPointerCasts())) +    
CI->setCallingConv(F->getCallingConv()); + +  return CI; +} + +Value *llvm::emitFWriteUnlocked(Value *Ptr, Value *Size, Value *N, Value *File, +                                IRBuilder<> &B, const DataLayout &DL, +                                const TargetLibraryInfo *TLI) { +  if (!TLI->has(LibFunc_fwrite_unlocked)) +    return nullptr; + +  Module *M = B.GetInsertBlock()->getModule(); +  LLVMContext &Context = B.GetInsertBlock()->getContext(); +  StringRef FWriteUnlockedName = TLI->getName(LibFunc_fwrite_unlocked); +  Constant *F = M->getOrInsertFunction( +      FWriteUnlockedName, DL.getIntPtrType(Context), B.getInt8PtrTy(), +      DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType()); + +  if (File->getType()->isPointerTy()) +    inferLibFuncAttributes(*M->getFunction(FWriteUnlockedName), *TLI); +  CallInst *CI = B.CreateCall(F, {castToCStr(Ptr, B), Size, N, File}); + +  if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts())) +    CI->setCallingConv(Fn->getCallingConv()); +  return CI; +} + +Value *llvm::emitFGetCUnlocked(Value *File, IRBuilder<> &B, +                               const TargetLibraryInfo *TLI) { +  if (!TLI->has(LibFunc_fgetc_unlocked)) +    return nullptr; + +  Module *M = B.GetInsertBlock()->getModule(); +  Constant *F = +      M->getOrInsertFunction("fgetc_unlocked", B.getInt32Ty(), File->getType()); +  if (File->getType()->isPointerTy()) +    inferLibFuncAttributes(*M->getFunction("fgetc_unlocked"), *TLI); +  CallInst *CI = B.CreateCall(F, File, "fgetc_unlocked"); + +  if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts())) +    CI->setCallingConv(Fn->getCallingConv()); +  return CI; +} + +Value *llvm::emitFGetSUnlocked(Value *Str, Value *Size, Value *File, +                               IRBuilder<> &B, const TargetLibraryInfo *TLI) { +  if (!TLI->has(LibFunc_fgets_unlocked)) +    return nullptr; + +  Module *M = B.GetInsertBlock()->getModule(); +  Constant *F = +      
M->getOrInsertFunction("fgets_unlocked", B.getInt8PtrTy(), +                             B.getInt8PtrTy(), B.getInt32Ty(), File->getType()); +  inferLibFuncAttributes(*M->getFunction("fgets_unlocked"), *TLI); +  CallInst *CI = +      B.CreateCall(F, {castToCStr(Str, B), Size, File}, "fgets_unlocked"); + +  if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts())) +    CI->setCallingConv(Fn->getCallingConv()); +  return CI; +} + +Value *llvm::emitFReadUnlocked(Value *Ptr, Value *Size, Value *N, Value *File, +                               IRBuilder<> &B, const DataLayout &DL, +                               const TargetLibraryInfo *TLI) { +  if (!TLI->has(LibFunc_fread_unlocked)) +    return nullptr; + +  Module *M = B.GetInsertBlock()->getModule(); +  LLVMContext &Context = B.GetInsertBlock()->getContext(); +  StringRef FReadUnlockedName = TLI->getName(LibFunc_fread_unlocked); +  Constant *F = M->getOrInsertFunction( +      FReadUnlockedName, DL.getIntPtrType(Context), B.getInt8PtrTy(), +      DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType()); + +  if (File->getType()->isPointerTy()) +    inferLibFuncAttributes(*M->getFunction(FReadUnlockedName), *TLI); +  CallInst *CI = B.CreateCall(F, {castToCStr(Ptr, B), Size, N, File}); + +  if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts())) +    CI->setCallingConv(Fn->getCallingConv()); +  return CI; +} diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp index f711b192f604..05512a6dff3e 100644 --- a/lib/Transforms/Utils/BypassSlowDivision.cpp +++ b/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -21,6 +21,7 @@  #include "llvm/ADT/Optional.h"  #include "llvm/ADT/STLExtras.h"  #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Transforms/Utils/Local.h"  #include "llvm/Analysis/ValueTracking.h"  #include "llvm/IR/BasicBlock.h"  #include "llvm/IR/Constants.h" @@ -34,7 +35,6 @@  #include "llvm/IR/Value.h"  #include 
"llvm/Support/Casting.h"  #include "llvm/Support/KnownBits.h" -#include "llvm/Transforms/Utils/Local.h"  #include <cassert>  #include <cstdint> @@ -173,7 +173,7 @@ Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {    return isDivisionOp() ? Value.Quotient : Value.Remainder;  } -/// \brief Check if a value looks like a hash. +/// Check if a value looks like a hash.  ///  /// The routine is expected to detect values computed using the most common hash  /// algorithms. Typically, hash computations end with one of the following diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt index 972e47f9270a..c87b74f739f4 100644 --- a/lib/Transforms/Utils/CMakeLists.txt +++ b/lib/Transforms/Utils/CMakeLists.txt @@ -25,8 +25,10 @@ add_llvm_library(LLVMTransformUtils    LCSSA.cpp    LibCallsShrinkWrap.cpp    Local.cpp +  LoopRotationUtils.cpp    LoopSimplify.cpp    LoopUnroll.cpp +  LoopUnrollAndJam.cpp    LoopUnrollPeel.cpp    LoopUnrollRuntime.cpp    LoopUtils.cpp @@ -43,10 +45,10 @@ add_llvm_library(LLVMTransformUtils    PromoteMemoryToRegister.cpp    StripGCRelocates.cpp    SSAUpdater.cpp +  SSAUpdaterBulk.cpp    SanitizerStats.cpp    SimplifyCFG.cpp    SimplifyIndVar.cpp -  SimplifyInstructions.cpp    SimplifyLibCalls.cpp    SplitModule.cpp    StripNonLineTableDebugInfo.cpp diff --git a/lib/Transforms/Utils/CallPromotionUtils.cpp b/lib/Transforms/Utils/CallPromotionUtils.cpp index 8825f77555e7..4d9c22e57a68 100644 --- a/lib/Transforms/Utils/CallPromotionUtils.cpp +++ b/lib/Transforms/Utils/CallPromotionUtils.cpp @@ -47,14 +47,11 @@ using namespace llvm;  ///  static void fixupPHINodeForNormalDest(InvokeInst *Invoke, BasicBlock *OrigBlock,                                        BasicBlock *MergeBlock) { -  for (auto &I : *Invoke->getNormalDest()) { -    auto *Phi = dyn_cast<PHINode>(&I); -    if (!Phi) -      break; -    int Idx = Phi->getBasicBlockIndex(OrigBlock); +  for (PHINode &Phi : Invoke->getNormalDest()->phis()) { +    
int Idx = Phi.getBasicBlockIndex(OrigBlock);      if (Idx == -1)        continue; -    Phi->setIncomingBlock(Idx, MergeBlock); +    Phi.setIncomingBlock(Idx, MergeBlock);    }  } @@ -82,16 +79,13 @@ static void fixupPHINodeForNormalDest(InvokeInst *Invoke, BasicBlock *OrigBlock,  static void fixupPHINodeForUnwindDest(InvokeInst *Invoke, BasicBlock *OrigBlock,                                        BasicBlock *ThenBlock,                                        BasicBlock *ElseBlock) { -  for (auto &I : *Invoke->getUnwindDest()) { -    auto *Phi = dyn_cast<PHINode>(&I); -    if (!Phi) -      break; -    int Idx = Phi->getBasicBlockIndex(OrigBlock); +  for (PHINode &Phi : Invoke->getUnwindDest()->phis()) { +    int Idx = Phi.getBasicBlockIndex(OrigBlock);      if (Idx == -1)        continue; -    auto *V = Phi->getIncomingValue(Idx); -    Phi->setIncomingBlock(Idx, ThenBlock); -    Phi->addIncoming(V, ElseBlock); +    auto *V = Phi.getIncomingValue(Idx); +    Phi.setIncomingBlock(Idx, ThenBlock); +    Phi.addIncoming(V, ElseBlock);    }  } @@ -395,12 +389,14 @@ Instruction *llvm::promoteCall(CallSite CS, Function *Callee,    // Inspect the arguments of the call site. If an argument's type doesn't    // match the corresponding formal argument's type in the callee, bitcast it    // to the correct type. 
-  for (Use &U : CS.args()) { -    unsigned ArgNo = CS.getArgumentNo(&U); -    Type *FormalTy = Callee->getFunctionType()->getParamType(ArgNo); -    Type *ActualTy = U.get()->getType(); +  auto CalleeType = Callee->getFunctionType(); +  auto CalleeParamNum = CalleeType->getNumParams(); +  for (unsigned ArgNo = 0; ArgNo < CalleeParamNum; ++ArgNo) { +    auto *Arg = CS.getArgument(ArgNo);  +    Type *FormalTy = CalleeType->getParamType(ArgNo); +    Type *ActualTy = Arg->getType();      if (FormalTy != ActualTy) { -      auto *Cast = CastInst::Create(Instruction::BitCast, U.get(), FormalTy, "", +      auto *Cast = CastInst::Create(Instruction::BitCast, Arg, FormalTy, "",                                      CS.getInstruction());        CS.setArgument(ArgNo, Cast);      } diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index 3b19ba1b50f2..61448e9acb57 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -18,6 +18,7 @@  #include "llvm/Analysis/ConstantFolding.h"  #include "llvm/Analysis/InstructionSimplify.h"  #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Transforms/Utils/Local.h"  #include "llvm/IR/CFG.h"  #include "llvm/IR/Constants.h"  #include "llvm/IR/DebugInfo.h" @@ -31,7 +32,6 @@  #include "llvm/IR/Module.h"  #include "llvm/Transforms/Utils/BasicBlockUtils.h"  #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/Local.h"  #include "llvm/Transforms/Utils/ValueMapper.h"  #include <map>  using namespace llvm; @@ -43,44 +43,36 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,                                    DebugInfoFinder *DIFinder) {    DenseMap<const MDNode *, MDNode *> Cache;    BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F); -  if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix); +  if (BB->hasName()) +    NewBB->setName(BB->getName() + NameSuffix);    bool hasCalls = false, 
hasDynamicAllocas = false, hasStaticAllocas = false;    Module *TheModule = F ? F->getParent() : nullptr;    // Loop over all instructions, and copy them over. -  for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end(); -       II != IE; ++II) { - -    if (DIFinder && TheModule) { -      if (auto *DDI = dyn_cast<DbgDeclareInst>(II)) -        DIFinder->processDeclare(*TheModule, DDI); -      else if (auto *DVI = dyn_cast<DbgValueInst>(II)) -        DIFinder->processValue(*TheModule, DVI); +  for (const Instruction &I : *BB) { +    if (DIFinder && TheModule) +      DIFinder->processInstruction(*TheModule, I); -      if (auto DbgLoc = II->getDebugLoc()) -        DIFinder->processLocation(*TheModule, DbgLoc.get()); -    } - -    Instruction *NewInst = II->clone(); -    if (II->hasName()) -      NewInst->setName(II->getName()+NameSuffix); +    Instruction *NewInst = I.clone(); +    if (I.hasName()) +      NewInst->setName(I.getName() + NameSuffix);      NewBB->getInstList().push_back(NewInst); -    VMap[&*II] = NewInst; // Add instruction map to value. +    VMap[&I] = NewInst; // Add instruction map to value. 
-    hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II)); -    if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) { +    hasCalls |= (isa<CallInst>(I) && !isa<DbgInfoIntrinsic>(I)); +    if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {        if (isa<ConstantInt>(AI->getArraySize()))          hasStaticAllocas = true;        else          hasDynamicAllocas = true;      }    } -   +    if (CodeInfo) {      CodeInfo->ContainsCalls          |= hasCalls;      CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas; -    CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas &&  +    CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas &&                                          BB != &BB->getParent()->getEntryBlock();    }    return NewBB; @@ -175,7 +167,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,      // Create a new basic block and copy instructions into it!      BasicBlock *CBB = CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo, -                                      SP ? &DIFinder : nullptr); +                                      ModuleLevelChanges ? &DIFinder : nullptr);      // Add basic block mapping.      VMap[&BB] = CBB; @@ -197,15 +189,15 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,        Returns.push_back(RI);    } -  for (DISubprogram *ISP : DIFinder.subprograms()) { -    if (ISP != SP) { +  for (DISubprogram *ISP : DIFinder.subprograms()) +    if (ISP != SP)        VMap.MD()[ISP].reset(ISP); -    } -  } -  for (auto *Type : DIFinder.types()) { +  for (DICompileUnit *CU : DIFinder.compile_units()) +    VMap.MD()[CU].reset(CU); + +  for (DIType *Type : DIFinder.types())      VMap.MD()[Type].reset(Type); -  }    // Loop over all of the instructions in the function, fixing up operand    // references as we go.  This uses VMap to do all the hard work. 
@@ -283,7 +275,7 @@ namespace {      /// The specified block is found to be reachable, clone it and      /// anything that it can reach. -    void CloneBlock(const BasicBlock *BB,  +    void CloneBlock(const BasicBlock *BB,                      BasicBlock::const_iterator StartingInst,                      std::vector<const BasicBlock*> &ToClone);    }; @@ -493,17 +485,13 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,      // Handle PHI nodes specially, as we have to remove references to dead      // blocks. -    for (BasicBlock::const_iterator I = BI.begin(), E = BI.end(); I != E; ++I) { +    for (const PHINode &PN : BI.phis()) {        // PHI nodes may have been remapped to non-PHI nodes by the caller or        // during the cloning process. -      if (const PHINode *PN = dyn_cast<PHINode>(I)) { -        if (isa<PHINode>(VMap[PN])) -          PHIToResolve.push_back(PN); -        else -          break; -      } else { +      if (isa<PHINode>(VMap[&PN])) +        PHIToResolve.push_back(&PN); +      else          break; -      }      }      // Finally, remap the terminator instructions, as those can't be remapped @@ -550,7 +538,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,      // phi nodes will have invalid entries.  Update the PHI nodes in this      // case.      PHINode *PN = cast<PHINode>(NewBB->begin()); -    NumPreds = std::distance(pred_begin(NewBB), pred_end(NewBB)); +    NumPreds = pred_size(NewBB);      if (NumPreds != PN->getNumIncomingValues()) {        assert(NumPreds < PN->getNumIncomingValues());        // Count how many times each predecessor comes to this block. @@ -722,7 +710,7 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,                              ModuleLevelChanges, Returns, NameSuffix, CodeInfo);  } -/// \brief Remaps instructions in \p Blocks using the mapping in \p VMap. 
+/// Remaps instructions in \p Blocks using the mapping in \p VMap.  void llvm::remapInstructionsInBlocks(      const SmallVectorImpl<BasicBlock *> &Blocks, ValueToValueMapTy &VMap) {    // Rewrite the code to refer to itself. @@ -732,7 +720,7 @@ void llvm::remapInstructionsInBlocks(                         RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);  } -/// \brief Clones a loop \p OrigLoop.  Returns the loop and the blocks in \p +/// Clones a loop \p OrigLoop.  Returns the loop and the blocks in \p  /// Blocks.  ///  /// Updates LoopInfo and DominatorTree assuming the loop is dominated by block @@ -796,12 +784,13 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,    return NewLoop;  } -/// \brief Duplicate non-Phi instructions from the beginning of block up to +/// Duplicate non-Phi instructions from the beginning of block up to  /// StopAt instruction into a split block between BB and its predecessor.  BasicBlock *  llvm::DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB,                                            Instruction *StopAt, -                                          ValueToValueMapTy &ValueMapping) { +                                          ValueToValueMapTy &ValueMapping, +                                          DominatorTree *DT) {    // We are going to have to map operands from the original BB block to the new    // copy of the block 'NewBB'.  If there are PHI nodes in BB, evaluate them to    // account for entry from PredBB. 
@@ -809,13 +798,15 @@ llvm::DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB,    for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)      ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB); -  BasicBlock *NewBB = SplitEdge(PredBB, BB); +  BasicBlock *NewBB = SplitEdge(PredBB, BB, DT);    NewBB->setName(PredBB->getName() + ".split");    Instruction *NewTerm = NewBB->getTerminator();    // Clone the non-phi instructions of BB into NewBB, keeping track of the    // mapping and using it to remap operands in the cloned instructions. -  for (; StopAt != &*BI; ++BI) { +  // Stop once we see the terminator too. This covers the case where BB's +  // terminator gets replaced and StopAt == BB's terminator. +  for (; StopAt != &*BI && BB->getTerminator() != &*BI; ++BI) {      Instruction *New = BI->clone();      New->setName(BI->getName());      New->insertBefore(NewTerm); diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp index 8fee10854229..35c7511a24b9 100644 --- a/lib/Transforms/Utils/CloneModule.cpp +++ b/lib/Transforms/Utils/CloneModule.cpp @@ -32,33 +32,34 @@ static void copyComdat(GlobalObject *Dst, const GlobalObject *Src) {  /// copies of global variables and functions, and making their (initializers and  /// references, respectively) refer to the right globals.  /// -std::unique_ptr<Module> llvm::CloneModule(const Module *M) { +std::unique_ptr<Module> llvm::CloneModule(const Module &M) {    // Create the value map that maps things from the old module over to the new    // module.    
ValueToValueMapTy VMap;    return CloneModule(M, VMap);  } -std::unique_ptr<Module> llvm::CloneModule(const Module *M, +std::unique_ptr<Module> llvm::CloneModule(const Module &M,                                            ValueToValueMapTy &VMap) {    return CloneModule(M, VMap, [](const GlobalValue *GV) { return true; });  }  std::unique_ptr<Module> llvm::CloneModule( -    const Module *M, ValueToValueMapTy &VMap, +    const Module &M, ValueToValueMapTy &VMap,      function_ref<bool(const GlobalValue *)> ShouldCloneDefinition) {    // First off, we need to create the new module.    std::unique_ptr<Module> New = -      llvm::make_unique<Module>(M->getModuleIdentifier(), M->getContext()); -  New->setDataLayout(M->getDataLayout()); -  New->setTargetTriple(M->getTargetTriple()); -  New->setModuleInlineAsm(M->getModuleInlineAsm()); -    +      llvm::make_unique<Module>(M.getModuleIdentifier(), M.getContext()); +  New->setSourceFileName(M.getSourceFileName()); +  New->setDataLayout(M.getDataLayout()); +  New->setTargetTriple(M.getTargetTriple()); +  New->setModuleInlineAsm(M.getModuleInlineAsm()); +    // Loop over all of the global variables, making corresponding globals in the    // new module.  Here we add them to the VMap and to the new Module.  We    // don't worry about attributes or initializers, they will come later.    
// -  for (Module::const_global_iterator I = M->global_begin(), E = M->global_end(); +  for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();         I != E; ++I) {      GlobalVariable *GV = new GlobalVariable(*New,                                               I->getValueType(), @@ -72,7 +73,7 @@ std::unique_ptr<Module> llvm::CloneModule(    }    // Loop over the functions in the module, making external functions as before -  for (const Function &I : *M) { +  for (const Function &I : M) {      Function *NF = Function::Create(cast<FunctionType>(I.getValueType()),                                      I.getLinkage(), I.getName(), New.get());      NF->copyAttributesFrom(&I); @@ -80,7 +81,7 @@ std::unique_ptr<Module> llvm::CloneModule(    }    // Loop over the aliases in the module -  for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end(); +  for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();         I != E; ++I) {      if (!ShouldCloneDefinition(&*I)) {        // An alias cannot act as an external reference, so we need to create @@ -114,7 +115,7 @@ std::unique_ptr<Module> llvm::CloneModule(    // have been created, loop through and copy the global variable referrers    // over...  We also set the attributes on the global now.    // -  for (Module::const_global_iterator I = M->global_begin(), E = M->global_end(); +  for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();         I != E; ++I) {      if (I->isDeclaration())        continue; @@ -139,7 +140,7 @@ std::unique_ptr<Module> llvm::CloneModule(    // Similarly, copy over function bodies now...    
// -  for (const Function &I : *M) { +  for (const Function &I : M) {      if (I.isDeclaration())        continue; @@ -169,7 +170,7 @@ std::unique_ptr<Module> llvm::CloneModule(    }    // And aliases -  for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end(); +  for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();         I != E; ++I) {      // We already dealt with undefined aliases above.      if (!ShouldCloneDefinition(&*I)) @@ -180,8 +181,9 @@ std::unique_ptr<Module> llvm::CloneModule(    }    // And named metadata.... -  for (Module::const_named_metadata_iterator I = M->named_metadata_begin(), -         E = M->named_metadata_end(); I != E; ++I) { +  for (Module::const_named_metadata_iterator I = M.named_metadata_begin(), +                                             E = M.named_metadata_end(); +       I != E; ++I) {      const NamedMDNode &NMD = *I;      NamedMDNode *NewNMD = New->getOrInsertNamedMetadata(NMD.getName());      for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i) @@ -194,7 +196,7 @@ std::unique_ptr<Module> llvm::CloneModule(  extern "C" {  LLVMModuleRef LLVMCloneModule(LLVMModuleRef M) { -  return wrap(CloneModule(unwrap(M)).release()); +  return wrap(CloneModule(*unwrap(M)).release());  }  } diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp index 7a404241cb14..f31dab9f96af 100644 --- a/lib/Transforms/Utils/CodeExtractor.cpp +++ b/lib/Transforms/Utils/CodeExtractor.cpp @@ -66,6 +66,7 @@  #include <vector>  using namespace llvm; +using ProfileCount = Function::ProfileCount;  #define DEBUG_TYPE "code-extractor" @@ -77,12 +78,10 @@ static cl::opt<bool>  AggregateArgsOpt("aggregate-extracted-args", cl::Hidden,                   cl::desc("Aggregate arguments to code-extracted functions")); -/// \brief Test whether a block is valid for extraction. 
-bool CodeExtractor::isBlockValidForExtraction(const BasicBlock &BB, -                                              bool AllowVarArgs) { -  // Landing pads must be in the function where they were inserted for cleanup. -  if (BB.isEHPad()) -    return false; +/// Test whether a block is valid for extraction. +static bool isBlockValidForExtraction(const BasicBlock &BB, +                                      const SetVector<BasicBlock *> &Result, +                                      bool AllowVarArgs, bool AllowAlloca) {    // taking the address of a basic block moved to another function is illegal    if (BB.hasAddressTaken())      return false; @@ -111,11 +110,63 @@ bool CodeExtractor::isBlockValidForExtraction(const BasicBlock &BB,      }    } -  // Don't hoist code containing allocas or invokes. If explicitly requested, -  // allow vastart. +  // If explicitly requested, allow vastart and alloca. For invoke instructions +  // verify that extraction is valid.    for (BasicBlock::const_iterator I = BB.begin(), E = BB.end(); I != E; ++I) { -    if (isa<AllocaInst>(I) || isa<InvokeInst>(I)) -      return false; +    if (isa<AllocaInst>(I)) { +       if (!AllowAlloca) +         return false; +       continue; +    } + +    if (const auto *II = dyn_cast<InvokeInst>(I)) { +      // Unwind destination (either a landingpad, catchswitch, or cleanuppad) +      // must be a part of the subgraph which is being extracted. +      if (auto *UBB = II->getUnwindDest()) +        if (!Result.count(UBB)) +          return false; +      continue; +    } + +    // All catch handlers of a catchswitch instruction as well as the unwind +    // destination must be in the subgraph. 
+    if (const auto *CSI = dyn_cast<CatchSwitchInst>(I)) { +      if (auto *UBB = CSI->getUnwindDest()) +        if (!Result.count(UBB)) +          return false; +      for (auto *HBB : CSI->handlers()) +        if (!Result.count(const_cast<BasicBlock*>(HBB))) +          return false; +      continue; +    } + +    // Make sure that entire catch handler is within subgraph. It is sufficient +    // to check that catch return's block is in the list. +    if (const auto *CPI = dyn_cast<CatchPadInst>(I)) { +      for (const auto *U : CPI->users()) +        if (const auto *CRI = dyn_cast<CatchReturnInst>(U)) +          if (!Result.count(const_cast<BasicBlock*>(CRI->getParent()))) +            return false; +      continue; +    } + +    // And do similar checks for cleanup handler - the entire handler must be +    // in subgraph which is going to be extracted. For cleanup return should +    // additionally check that the unwind destination is also in the subgraph. +    if (const auto *CPI = dyn_cast<CleanupPadInst>(I)) { +      for (const auto *U : CPI->users()) +        if (const auto *CRI = dyn_cast<CleanupReturnInst>(U)) +          if (!Result.count(const_cast<BasicBlock*>(CRI->getParent()))) +            return false; +      continue; +    } +    if (const auto *CRI = dyn_cast<CleanupReturnInst>(I)) { +      if (auto *UBB = CRI->getUnwindDest()) +        if (!Result.count(UBB)) +          return false; +      continue; +    } +      if (const CallInst *CI = dyn_cast<CallInst>(I))        if (const Function *F = CI->getCalledFunction())          if (F->getIntrinsicID() == Intrinsic::vastart) { @@ -129,10 +180,10 @@ bool CodeExtractor::isBlockValidForExtraction(const BasicBlock &BB,    return true;  } -/// \brief Build a set of blocks to extract if the input blocks are viable. +/// Build a set of blocks to extract if the input blocks are viable.  
static SetVector<BasicBlock *>  buildExtractionBlockSet(ArrayRef<BasicBlock *> BBs, DominatorTree *DT, -                        bool AllowVarArgs) { +                        bool AllowVarArgs, bool AllowAlloca) {    assert(!BBs.empty() && "The set of blocks to extract must be non-empty");    SetVector<BasicBlock *> Result; @@ -145,32 +196,42 @@ buildExtractionBlockSet(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,      if (!Result.insert(BB))        llvm_unreachable("Repeated basic blocks in extraction input"); -    if (!CodeExtractor::isBlockValidForExtraction(*BB, AllowVarArgs)) { -      Result.clear(); -      return Result; -    }    } -#ifndef NDEBUG -  for (SetVector<BasicBlock *>::iterator I = std::next(Result.begin()), -                                         E = Result.end(); -       I != E; ++I) -    for (pred_iterator PI = pred_begin(*I), PE = pred_end(*I); -         PI != PE; ++PI) -      assert(Result.count(*PI) && -             "No blocks in this region may have entries from outside the region" -             " except for the first block!"); -#endif +  for (auto *BB : Result) { +    if (!isBlockValidForExtraction(*BB, Result, AllowVarArgs, AllowAlloca)) +      return {}; + +    // Make sure that the first block is not a landing pad. +    if (BB == Result.front()) { +      if (BB->isEHPad()) { +        LLVM_DEBUG(dbgs() << "The first block cannot be an unwind block\n"); +        return {}; +      } +      continue; +    } + +    // All blocks other than the first must not have predecessors outside of +    // the subgraph which is being extracted. 
+    for (auto *PBB : predecessors(BB)) +      if (!Result.count(PBB)) { +        LLVM_DEBUG( +            dbgs() << "No blocks in this region may have entries from " +                      "outside the region except for the first block!\n"); +        return {}; +      } +  }    return Result;  }  CodeExtractor::CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,                               bool AggregateArgs, BlockFrequencyInfo *BFI, -                             BranchProbabilityInfo *BPI, bool AllowVarArgs) +                             BranchProbabilityInfo *BPI, bool AllowVarArgs, +                             bool AllowAlloca)      : DT(DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),        BPI(BPI), AllowVarArgs(AllowVarArgs), -      Blocks(buildExtractionBlockSet(BBs, DT, AllowVarArgs)) {} +      Blocks(buildExtractionBlockSet(BBs, DT, AllowVarArgs, AllowAlloca)) {}  CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs,                               BlockFrequencyInfo *BFI, @@ -178,7 +239,8 @@ CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs,      : DT(&DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),        BPI(BPI), AllowVarArgs(false),        Blocks(buildExtractionBlockSet(L.getBlocks(), &DT, -                                     /* AllowVarArgs */ false)) {} +                                     /* AllowVarArgs */ false, +                                     /* AllowAlloca */ false)) {}  /// definedInRegion - Return true if the specified value is defined in the  /// extracted region. 
@@ -562,8 +624,8 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,                                             BasicBlock *newHeader,                                             Function *oldFunction,                                             Module *M) { -  DEBUG(dbgs() << "inputs: " << inputs.size() << "\n"); -  DEBUG(dbgs() << "outputs: " << outputs.size() << "\n"); +  LLVM_DEBUG(dbgs() << "inputs: " << inputs.size() << "\n"); +  LLVM_DEBUG(dbgs() << "outputs: " << outputs.size() << "\n");    // This function returns unsigned, outputs will go back by reference.    switch (NumExitBlocks) { @@ -577,20 +639,20 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,    // Add the types of the input values to the function's argument list    for (Value *value : inputs) { -    DEBUG(dbgs() << "value used in func: " << *value << "\n"); +    LLVM_DEBUG(dbgs() << "value used in func: " << *value << "\n");      paramTy.push_back(value->getType());    }    // Add the types of the output values to the function's argument list.    for (Value *output : outputs) { -    DEBUG(dbgs() << "instr used in func: " << *output << "\n"); +    LLVM_DEBUG(dbgs() << "instr used in func: " << *output << "\n");      if (AggregateArgs)        paramTy.push_back(output->getType());      else        paramTy.push_back(PointerType::getUnqual(output->getType()));    } -  DEBUG({ +  LLVM_DEBUG({      dbgs() << "Function type: " << *RetTy << " f(";      for (Type *i : paramTy)        dbgs() << *i << ", "; @@ -620,16 +682,89 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,    if (oldFunction->hasUWTable())      newFunction->setHasUWTable(); -  // Inherit all of the target dependent attributes. +  // Inherit all of the target dependent attributes and white-listed +  // target independent attributes.    //  (e.g. 
If the extracted region contains a call to an x86.sse    //  instruction we need to make sure that the extracted region has the    //  "target-features" attribute allowing it to be lowered.    // FIXME: This should be changed to check to see if a specific    //           attribute can not be inherited. -  AttrBuilder AB(oldFunction->getAttributes().getFnAttributes()); -  for (const auto &Attr : AB.td_attrs()) -    newFunction->addFnAttr(Attr.first, Attr.second); +  for (const auto &Attr : oldFunction->getAttributes().getFnAttributes()) { +    if (Attr.isStringAttribute()) { +      if (Attr.getKindAsString() == "thunk") +        continue; +    } else +      switch (Attr.getKindAsEnum()) { +      // Those attributes cannot be propagated safely. Explicitly list them +      // here so we get a warning if new attributes are added. This list also +      // includes non-function attributes. +      case Attribute::Alignment: +      case Attribute::AllocSize: +      case Attribute::ArgMemOnly: +      case Attribute::Builtin: +      case Attribute::ByVal: +      case Attribute::Convergent: +      case Attribute::Dereferenceable: +      case Attribute::DereferenceableOrNull: +      case Attribute::InAlloca: +      case Attribute::InReg: +      case Attribute::InaccessibleMemOnly: +      case Attribute::InaccessibleMemOrArgMemOnly: +      case Attribute::JumpTable: +      case Attribute::Naked: +      case Attribute::Nest: +      case Attribute::NoAlias: +      case Attribute::NoBuiltin: +      case Attribute::NoCapture: +      case Attribute::NoReturn: +      case Attribute::None: +      case Attribute::NonNull: +      case Attribute::ReadNone: +      case Attribute::ReadOnly: +      case Attribute::Returned: +      case Attribute::ReturnsTwice: +      case Attribute::SExt: +      case Attribute::Speculatable: +      case Attribute::StackAlignment: +      case Attribute::StructRet: +      case Attribute::SwiftError: +      case Attribute::SwiftSelf: +      case 
Attribute::WriteOnly: +      case Attribute::ZExt: +      case Attribute::EndAttrKinds: +        continue; +      // Those attributes should be safe to propagate to the extracted function. +      case Attribute::AlwaysInline: +      case Attribute::Cold: +      case Attribute::NoRecurse: +      case Attribute::InlineHint: +      case Attribute::MinSize: +      case Attribute::NoDuplicate: +      case Attribute::NoImplicitFloat: +      case Attribute::NoInline: +      case Attribute::NonLazyBind: +      case Attribute::NoRedZone: +      case Attribute::NoUnwind: +      case Attribute::OptForFuzzing: +      case Attribute::OptimizeNone: +      case Attribute::OptimizeForSize: +      case Attribute::SafeStack: +      case Attribute::ShadowCallStack: +      case Attribute::SanitizeAddress: +      case Attribute::SanitizeMemory: +      case Attribute::SanitizeThread: +      case Attribute::SanitizeHWAddress: +      case Attribute::StackProtect: +      case Attribute::StackProtectReq: +      case Attribute::StackProtectStrong: +      case Attribute::StrictFP: +      case Attribute::UWTable: +      case Attribute::NoCfCheck: +        break; +      } +    newFunction->addFnAttr(Attr); +  }    newFunction->getBasicBlockList().push_back(newRootNode);    // Create an iterator to name all of the arguments we inserted. @@ -1093,10 +1228,10 @@ Function *CodeExtractor::extractCodeRegion() {    // Update the entry count of the function.    
if (BFI) { -    Optional<uint64_t> EntryCount = -        BFI->getProfileCountFromFreq(EntryFreq.getFrequency()); -    if (EntryCount.hasValue()) -      newFunction->setEntryCount(EntryCount.getValue()); +    auto Count = BFI->getProfileCountFromFreq(EntryFreq.getFrequency()); +    if (Count.hasValue()) +      newFunction->setEntryCount( +          ProfileCount(Count.getValue(), Function::PCT_Real)); // FIXME      BFI->setBlockFreq(codeReplacer, EntryFreq.getFrequency());    } @@ -1104,6 +1239,10 @@ Function *CodeExtractor::extractCodeRegion() {    moveCodeToFunction(newFunction); +  // Propagate personality info to the new function if there is one. +  if (oldFunction->hasPersonalityFn()) +    newFunction->setPersonalityFn(oldFunction->getPersonalityFn()); +    // Update the branch weights for the exit block.    if (BFI && NumExitBlocks > 1)      calculateNewCallTerminatorWeights(codeReplacer, ExitWeights, BPI); @@ -1139,7 +1278,7 @@ Function *CodeExtractor::extractCodeRegion() {          }      } -  DEBUG(if (verifyFunction(*newFunction))  -        report_fatal_error("verifyFunction failed!")); +  LLVM_DEBUG(if (verifyFunction(*newFunction)) +                 report_fatal_error("verifyFunction failed!"));    return newFunction;  } diff --git a/lib/Transforms/Utils/CtorUtils.cpp b/lib/Transforms/Utils/CtorUtils.cpp index 82b67c293102..9a0240144d08 100644 --- a/lib/Transforms/Utils/CtorUtils.cpp +++ b/lib/Transforms/Utils/CtorUtils.cpp @@ -138,7 +138,7 @@ bool optimizeGlobalCtorsList(Module &M,      if (!F)        continue; -    DEBUG(dbgs() << "Optimizing Global Constructor: " << *F << "\n"); +    LLVM_DEBUG(dbgs() << "Optimizing Global Constructor: " << *F << "\n");      // We cannot simplify external ctor functions.      
if (F->empty()) diff --git a/lib/Transforms/Utils/DemoteRegToStack.cpp b/lib/Transforms/Utils/DemoteRegToStack.cpp index 6d3d287defdb..56ff03c7f5e1 100644 --- a/lib/Transforms/Utils/DemoteRegToStack.cpp +++ b/lib/Transforms/Utils/DemoteRegToStack.cpp @@ -9,11 +9,11 @@  #include "llvm/ADT/DenseMap.h"  #include "llvm/Analysis/CFG.h" +#include "llvm/Transforms/Utils/Local.h"  #include "llvm/IR/Function.h"  #include "llvm/IR/Instructions.h"  #include "llvm/IR/Type.h"  #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h"  using namespace llvm;  /// DemoteRegToStack - This function takes a virtual register computed by an diff --git a/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/lib/Transforms/Utils/EntryExitInstrumenter.cpp index 421663f82565..569ea58a3047 100644 --- a/lib/Transforms/Utils/EntryExitInstrumenter.cpp +++ b/lib/Transforms/Utils/EntryExitInstrumenter.cpp @@ -9,14 +9,13 @@  #include "llvm/Transforms/Utils/EntryExitInstrumenter.h"  #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/CodeGen/Passes.h"  #include "llvm/IR/DebugInfoMetadata.h"  #include "llvm/IR/Function.h"  #include "llvm/IR/Instructions.h"  #include "llvm/IR/Module.h"  #include "llvm/IR/Type.h"  #include "llvm/Pass.h" -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h"  using namespace llvm;  static void insertCall(Function &CurFn, StringRef Func, @@ -92,17 +91,27 @@ static bool runOnFunction(Function &F, bool PostInlining) {    if (!ExitFunc.empty()) {      for (BasicBlock &BB : F) { -      TerminatorInst *T = BB.getTerminator(); +      Instruction *T = BB.getTerminator(); +      if (!isa<ReturnInst>(T)) +        continue; + +      // If T is preceded by a musttail call, that's the real terminator. 
+      Instruction *Prev = T->getPrevNode(); +      if (BitCastInst *BCI = dyn_cast_or_null<BitCastInst>(Prev)) +        Prev = BCI->getPrevNode(); +      if (CallInst *CI = dyn_cast_or_null<CallInst>(Prev)) { +        if (CI->isMustTailCall()) +          T = CI; +      } +        DebugLoc DL;        if (DebugLoc TerminatorDL = T->getDebugLoc())          DL = TerminatorDL;        else if (auto SP = F.getSubprogram())          DL = DebugLoc::get(0, 0, SP); -      if (isa<ReturnInst>(T)) { -        insertCall(F, ExitFunc, T, DL); -        Changed = true; -      } +      insertCall(F, ExitFunc, T, DL); +      Changed = true;      }      F.removeAttribute(AttributeList::FunctionIndex, ExitAttr);    } diff --git a/lib/Transforms/Utils/EscapeEnumerator.cpp b/lib/Transforms/Utils/EscapeEnumerator.cpp index 78d7474e5b95..c9c96fbe5da0 100644 --- a/lib/Transforms/Utils/EscapeEnumerator.cpp +++ b/lib/Transforms/Utils/EscapeEnumerator.cpp @@ -14,9 +14,9 @@  #include "llvm/Transforms/Utils/EscapeEnumerator.h"  #include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Transforms/Utils/Local.h"  #include "llvm/IR/CallSite.h"  #include "llvm/IR/Module.h" -#include "llvm/Transforms/Utils/Local.h"  using namespace llvm;  static Constant *getDefaultPersonalityFn(Module *M) { @@ -73,8 +73,8 @@ IRBuilder<> *EscapeEnumerator::Next() {      F.setPersonalityFn(PersFn);    } -  if (isFuncletEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) { -    report_fatal_error("Funclet EH not supported"); +  if (isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) { +    report_fatal_error("Scoped EH not supported");    }    LandingPadInst *LPad = diff --git a/lib/Transforms/Utils/Evaluator.cpp b/lib/Transforms/Utils/Evaluator.cpp index 3c5e299fae98..7fd9425efed3 100644 --- a/lib/Transforms/Utils/Evaluator.cpp +++ b/lib/Transforms/Utils/Evaluator.cpp @@ -24,6 +24,7 @@  #include "llvm/IR/DataLayout.h"  #include "llvm/IR/DerivedTypes.h"  #include "llvm/IR/Function.h" +#include 
"llvm/IR/GlobalAlias.h"  #include "llvm/IR/GlobalValue.h"  #include "llvm/IR/GlobalVariable.h"  #include "llvm/IR/InstrTypes.h" @@ -174,6 +175,11 @@ static bool isSimpleEnoughPointerToCommit(Constant *C) {    return false;  } +static Constant *getInitializer(Constant *C) { +  auto *GV = dyn_cast<GlobalVariable>(C); +  return GV && GV->hasDefinitiveInitializer() ? GV->getInitializer() : nullptr; +} +  /// Return the value that would be computed by a load from P after the stores  /// reflected by 'memory' have been performed.  If we can't decide, return null.  Constant *Evaluator::ComputeLoadResult(Constant *P) { @@ -189,18 +195,96 @@ Constant *Evaluator::ComputeLoadResult(Constant *P) {      return nullptr;    } -  // Handle a constantexpr getelementptr. -  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(P)) -    if (CE->getOpcode() == Instruction::GetElementPtr && -        isa<GlobalVariable>(CE->getOperand(0))) { -      GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0)); -      if (GV->hasDefinitiveInitializer()) -        return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE); +  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(P)) { +    switch (CE->getOpcode()) { +    // Handle a constantexpr getelementptr. +    case Instruction::GetElementPtr: +      if (auto *I = getInitializer(CE->getOperand(0))) +        return ConstantFoldLoadThroughGEPConstantExpr(I, CE); +      break; +    // Handle a constantexpr bitcast. +    case Instruction::BitCast: +      Constant *Val = getVal(CE->getOperand(0)); +      auto MM = MutatedMemory.find(Val); +      auto *I = (MM != MutatedMemory.end()) ? MM->second +                                            : getInitializer(CE->getOperand(0)); +      if (I) +        return ConstantFoldLoadThroughBitcast( +            I, P->getType()->getPointerElementType(), DL); +      break;      } +  }    return nullptr;  // don't know how to evaluate.  
} +static Function *getFunction(Constant *C) { +  if (auto *Fn = dyn_cast<Function>(C)) +    return Fn; + +  if (auto *Alias = dyn_cast<GlobalAlias>(C)) +    if (auto *Fn = dyn_cast<Function>(Alias->getAliasee())) +      return Fn; +  return nullptr; +} + +Function * +Evaluator::getCalleeWithFormalArgs(CallSite &CS, +                                   SmallVector<Constant *, 8> &Formals) { +  auto *V = CS.getCalledValue(); +  if (auto *Fn = getFunction(getVal(V))) +    return getFormalParams(CS, Fn, Formals) ? Fn : nullptr; + +  auto *CE = dyn_cast<ConstantExpr>(V); +  if (!CE || CE->getOpcode() != Instruction::BitCast || +      !getFormalParams(CS, getFunction(CE->getOperand(0)), Formals)) +    return nullptr; + +  return dyn_cast<Function>( +      ConstantFoldLoadThroughBitcast(CE, CE->getOperand(0)->getType(), DL)); +} + +bool Evaluator::getFormalParams(CallSite &CS, Function *F, +                                SmallVector<Constant *, 8> &Formals) { +  if (!F) +    return false; + +  auto *FTy = F->getFunctionType(); +  if (FTy->getNumParams() > CS.getNumArgOperands()) { +    LLVM_DEBUG(dbgs() << "Too few arguments for function.\n"); +    return false; +  } + +  auto ArgI = CS.arg_begin(); +  for (auto ParI = FTy->param_begin(), ParE = FTy->param_end(); ParI != ParE; +       ++ParI) { +    auto *ArgC = ConstantFoldLoadThroughBitcast(getVal(*ArgI), *ParI, DL); +    if (!ArgC) { +      LLVM_DEBUG(dbgs() << "Can not convert function argument.\n"); +      return false; +    } +    Formals.push_back(ArgC); +    ++ArgI; +  } +  return true; +} + +/// If call expression contains bitcast then we may need to cast +/// evaluated return value to a type of the call expression. 
+Constant *Evaluator::castCallResultIfNeeded(Value *CallExpr, Constant *RV) { +  ConstantExpr *CE = dyn_cast<ConstantExpr>(CallExpr); +  if (!RV || !CE || CE->getOpcode() != Instruction::BitCast) +    return RV; + +  if (auto *FT = +          dyn_cast<FunctionType>(CE->getType()->getPointerElementType())) { +    RV = ConstantFoldLoadThroughBitcast(RV, FT->getReturnType(), DL); +    if (!RV) +      LLVM_DEBUG(dbgs() << "Failed to fold bitcast call expr\n"); +  } +  return RV; +} +  /// Evaluate all instructions in block BB, returning true if successful, false  /// if we can't evaluate it.  NewBB returns the next BB that control flows into,  /// or null upon return. @@ -210,22 +294,23 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,    while (true) {      Constant *InstResult = nullptr; -    DEBUG(dbgs() << "Evaluating Instruction: " << *CurInst << "\n"); +    LLVM_DEBUG(dbgs() << "Evaluating Instruction: " << *CurInst << "\n");      if (StoreInst *SI = dyn_cast<StoreInst>(CurInst)) {        if (!SI->isSimple()) { -        DEBUG(dbgs() << "Store is not simple! Can not evaluate.\n"); +        LLVM_DEBUG(dbgs() << "Store is not simple! Can not evaluate.\n");          return false;  // no volatile/atomic accesses.        }        Constant *Ptr = getVal(SI->getOperand(1));        if (auto *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI)) { -        DEBUG(dbgs() << "Folding constant ptr expression: " << *Ptr); +        LLVM_DEBUG(dbgs() << "Folding constant ptr expression: " << *Ptr);          Ptr = FoldedPtr; -        DEBUG(dbgs() << "; To: " << *Ptr << "\n"); +        LLVM_DEBUG(dbgs() << "; To: " << *Ptr << "\n");        }        if (!isSimpleEnoughPointerToCommit(Ptr)) {          // If this is too complex for us to commit, reject it. 
-        DEBUG(dbgs() << "Pointer is too complex for us to evaluate store."); +        LLVM_DEBUG( +            dbgs() << "Pointer is too complex for us to evaluate store.");          return false;        } @@ -234,14 +319,15 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,        // If this might be too difficult for the backend to handle (e.g. the addr        // of one global variable divided by another) then we can't commit it.        if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, DL)) { -        DEBUG(dbgs() << "Store value is too complex to evaluate store. " << *Val -              << "\n"); +        LLVM_DEBUG(dbgs() << "Store value is too complex to evaluate store. " +                          << *Val << "\n");          return false;        }        if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {          if (CE->getOpcode() == Instruction::BitCast) { -          DEBUG(dbgs() << "Attempting to resolve bitcast on constant ptr.\n"); +          LLVM_DEBUG(dbgs() +                     << "Attempting to resolve bitcast on constant ptr.\n");            // If we're evaluating a store through a bitcast, then we need            // to pull the bitcast off the pointer type and push it onto the            // stored value. @@ -252,7 +338,8 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,            // In order to push the bitcast onto the stored value, a bitcast            // from NewTy to Val's type must be legal.  If it's not, we can try            // introspecting NewTy to find a legal conversion. -          while (!Val->getType()->canLosslesslyBitCastTo(NewTy)) { +          Constant *NewVal; +          while (!(NewVal = ConstantFoldLoadThroughBitcast(Val, NewTy, DL))) {              // If NewTy is a struct, we can convert the pointer to the struct              // into a pointer to its first member.              // FIXME: This could be extended to support arrays as well. 
@@ -270,17 +357,14 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,              // If we can't improve the situation by introspecting NewTy,              // we have to give up.              } else { -              DEBUG(dbgs() << "Failed to bitcast constant ptr, can not " -                    "evaluate.\n"); +              LLVM_DEBUG(dbgs() << "Failed to bitcast constant ptr, can not " +                                   "evaluate.\n");                return false;              }            } -          // If we found compatible types, go ahead and push the bitcast -          // onto the stored value. -          Val = ConstantExpr::getBitCast(Val, NewTy); - -          DEBUG(dbgs() << "Evaluated bitcast: " << *Val << "\n"); +          Val = NewVal; +          LLVM_DEBUG(dbgs() << "Evaluated bitcast: " << *Val << "\n");          }        } @@ -289,37 +373,37 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,        InstResult = ConstantExpr::get(BO->getOpcode(),                                       getVal(BO->getOperand(0)),                                       getVal(BO->getOperand(1))); -      DEBUG(dbgs() << "Found a BinaryOperator! Simplifying: " << *InstResult -            << "\n"); +      LLVM_DEBUG(dbgs() << "Found a BinaryOperator! Simplifying: " +                        << *InstResult << "\n");      } else if (CmpInst *CI = dyn_cast<CmpInst>(CurInst)) {        InstResult = ConstantExpr::getCompare(CI->getPredicate(),                                              getVal(CI->getOperand(0)),                                              getVal(CI->getOperand(1))); -      DEBUG(dbgs() << "Found a CmpInst! Simplifying: " << *InstResult -            << "\n"); +      LLVM_DEBUG(dbgs() << "Found a CmpInst! 
Simplifying: " << *InstResult +                        << "\n");      } else if (CastInst *CI = dyn_cast<CastInst>(CurInst)) {        InstResult = ConstantExpr::getCast(CI->getOpcode(),                                           getVal(CI->getOperand(0)),                                           CI->getType()); -      DEBUG(dbgs() << "Found a Cast! Simplifying: " << *InstResult -            << "\n"); +      LLVM_DEBUG(dbgs() << "Found a Cast! Simplifying: " << *InstResult +                        << "\n");      } else if (SelectInst *SI = dyn_cast<SelectInst>(CurInst)) {        InstResult = ConstantExpr::getSelect(getVal(SI->getOperand(0)),                                             getVal(SI->getOperand(1)),                                             getVal(SI->getOperand(2))); -      DEBUG(dbgs() << "Found a Select! Simplifying: " << *InstResult -            << "\n"); +      LLVM_DEBUG(dbgs() << "Found a Select! Simplifying: " << *InstResult +                        << "\n");      } else if (auto *EVI = dyn_cast<ExtractValueInst>(CurInst)) {        InstResult = ConstantExpr::getExtractValue(            getVal(EVI->getAggregateOperand()), EVI->getIndices()); -      DEBUG(dbgs() << "Found an ExtractValueInst! Simplifying: " << *InstResult -                   << "\n"); +      LLVM_DEBUG(dbgs() << "Found an ExtractValueInst! Simplifying: " +                        << *InstResult << "\n");      } else if (auto *IVI = dyn_cast<InsertValueInst>(CurInst)) {        InstResult = ConstantExpr::getInsertValue(            getVal(IVI->getAggregateOperand()),            getVal(IVI->getInsertedValueOperand()), IVI->getIndices()); -      DEBUG(dbgs() << "Found an InsertValueInst! Simplifying: " << *InstResult -                   << "\n"); +      LLVM_DEBUG(dbgs() << "Found an InsertValueInst! 
Simplifying: " +                        << *InstResult << "\n");      } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurInst)) {        Constant *P = getVal(GEP->getOperand(0));        SmallVector<Constant*, 8> GEPOps; @@ -329,60 +413,63 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,        InstResult =            ConstantExpr::getGetElementPtr(GEP->getSourceElementType(), P, GEPOps,                                           cast<GEPOperator>(GEP)->isInBounds()); -      DEBUG(dbgs() << "Found a GEP! Simplifying: " << *InstResult -            << "\n"); +      LLVM_DEBUG(dbgs() << "Found a GEP! Simplifying: " << *InstResult << "\n");      } else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) {        if (!LI->isSimple()) { -        DEBUG(dbgs() << "Found a Load! Not a simple load, can not evaluate.\n"); +        LLVM_DEBUG( +            dbgs() << "Found a Load! Not a simple load, can not evaluate.\n");          return false;  // no volatile/atomic accesses.        }        Constant *Ptr = getVal(LI->getOperand(0));        if (auto *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI)) {          Ptr = FoldedPtr; -        DEBUG(dbgs() << "Found a constant pointer expression, constant " -              "folding: " << *Ptr << "\n"); +        LLVM_DEBUG(dbgs() << "Found a constant pointer expression, constant " +                             "folding: " +                          << *Ptr << "\n");        }        InstResult = ComputeLoadResult(Ptr);        if (!InstResult) { -        DEBUG(dbgs() << "Failed to compute load result. Can not evaluate load." -              "\n"); +        LLVM_DEBUG( +            dbgs() << "Failed to compute load result. Can not evaluate load." +                      "\n");          return false; // Could not evaluate load.        
} -      DEBUG(dbgs() << "Evaluated load: " << *InstResult << "\n"); +      LLVM_DEBUG(dbgs() << "Evaluated load: " << *InstResult << "\n");      } else if (AllocaInst *AI = dyn_cast<AllocaInst>(CurInst)) {        if (AI->isArrayAllocation()) { -        DEBUG(dbgs() << "Found an array alloca. Can not evaluate.\n"); +        LLVM_DEBUG(dbgs() << "Found an array alloca. Can not evaluate.\n");          return false;  // Cannot handle array allocs.        }        Type *Ty = AI->getAllocatedType();        AllocaTmps.push_back(llvm::make_unique<GlobalVariable>(            Ty, false, GlobalValue::InternalLinkage, UndefValue::get(Ty), -          AI->getName())); +          AI->getName(), /*TLMode=*/GlobalValue::NotThreadLocal, +          AI->getType()->getPointerAddressSpace()));        InstResult = AllocaTmps.back().get(); -      DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n"); +      LLVM_DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n");      } else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) {        CallSite CS(&*CurInst);        // Debug info can safely be ignored here.        if (isa<DbgInfoIntrinsic>(CS.getInstruction())) { -        DEBUG(dbgs() << "Ignoring debug info.\n"); +        LLVM_DEBUG(dbgs() << "Ignoring debug info.\n");          ++CurInst;          continue;        }        // Cannot handle inline asm.        
if (isa<InlineAsm>(CS.getCalledValue())) { -        DEBUG(dbgs() << "Found inline asm, can not evaluate.\n"); +        LLVM_DEBUG(dbgs() << "Found inline asm, can not evaluate.\n");          return false;        }        if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {          if (MemSetInst *MSI = dyn_cast<MemSetInst>(II)) {            if (MSI->isVolatile()) { -            DEBUG(dbgs() << "Can not optimize a volatile memset " << -                  "intrinsic.\n"); +            LLVM_DEBUG(dbgs() << "Can not optimize a volatile memset " +                              << "intrinsic.\n");              return false;            }            Constant *Ptr = getVal(MSI->getDest()); @@ -390,7 +477,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,            Constant *DestVal = ComputeLoadResult(getVal(Ptr));            if (Val->isNullValue() && DestVal && DestVal->isNullValue()) {              // This memset is a no-op. -            DEBUG(dbgs() << "Ignoring no-op memset.\n"); +            LLVM_DEBUG(dbgs() << "Ignoring no-op memset.\n");              ++CurInst;              continue;            } @@ -398,7 +485,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,          if (II->getIntrinsicID() == Intrinsic::lifetime_start ||              II->getIntrinsicID() == Intrinsic::lifetime_end) { -          DEBUG(dbgs() << "Ignoring lifetime intrinsic.\n"); +          LLVM_DEBUG(dbgs() << "Ignoring lifetime intrinsic.\n");            ++CurInst;            continue;          } @@ -407,7 +494,8 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,            // We don't insert an entry into Values, as it doesn't have a            // meaningful return value.            if (!II->use_empty()) { -            DEBUG(dbgs() << "Found unused invariant_start. Can't evaluate.\n"); +            LLVM_DEBUG(dbgs() +                       << "Found unused invariant_start. 
Can't evaluate.\n");              return false;            }            ConstantInt *Size = cast<ConstantInt>(II->getArgOperand(0)); @@ -419,54 +507,54 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,                  Size->getValue().getLimitedValue() >=                      DL.getTypeStoreSize(ElemTy)) {                Invariants.insert(GV); -              DEBUG(dbgs() << "Found a global var that is an invariant: " << *GV -                    << "\n"); +              LLVM_DEBUG(dbgs() << "Found a global var that is an invariant: " +                                << *GV << "\n");              } else { -              DEBUG(dbgs() << "Found a global var, but can not treat it as an " -                    "invariant.\n"); +              LLVM_DEBUG(dbgs() +                         << "Found a global var, but can not treat it as an " +                            "invariant.\n");              }            }            // Continue even if we do nothing.            ++CurInst;            continue;          } else if (II->getIntrinsicID() == Intrinsic::assume) { -          DEBUG(dbgs() << "Skipping assume intrinsic.\n"); +          LLVM_DEBUG(dbgs() << "Skipping assume intrinsic.\n");            ++CurInst;            continue;          } else if (II->getIntrinsicID() == Intrinsic::sideeffect) { -          DEBUG(dbgs() << "Skipping sideeffect intrinsic.\n"); +          LLVM_DEBUG(dbgs() << "Skipping sideeffect intrinsic.\n");            ++CurInst;            continue;          } -        DEBUG(dbgs() << "Unknown intrinsic. Can not evaluate.\n"); +        LLVM_DEBUG(dbgs() << "Unknown intrinsic. Can not evaluate.\n");          return false;        }        // Resolve function pointers. 
-      Function *Callee = dyn_cast<Function>(getVal(CS.getCalledValue())); +      SmallVector<Constant *, 8> Formals; +      Function *Callee = getCalleeWithFormalArgs(CS, Formals);        if (!Callee || Callee->isInterposable()) { -        DEBUG(dbgs() << "Can not resolve function pointer.\n"); +        LLVM_DEBUG(dbgs() << "Can not resolve function pointer.\n");          return false;  // Cannot resolve.        } -      SmallVector<Constant*, 8> Formals; -      for (User::op_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i) -        Formals.push_back(getVal(*i)); -        if (Callee->isDeclaration()) {          // If this is a function we can constant fold, do it.          if (Constant *C = ConstantFoldCall(CS, Callee, Formals, TLI)) { -          InstResult = C; -          DEBUG(dbgs() << "Constant folded function call. Result: " << -                *InstResult << "\n"); +          InstResult = castCallResultIfNeeded(CS.getCalledValue(), C); +          if (!InstResult) +            return false; +          LLVM_DEBUG(dbgs() << "Constant folded function call. Result: " +                            << *InstResult << "\n");          } else { -          DEBUG(dbgs() << "Can not constant fold function call.\n"); +          LLVM_DEBUG(dbgs() << "Can not constant fold function call.\n");            return false;          }        } else {          if (Callee->getFunctionType()->isVarArg()) { -          DEBUG(dbgs() << "Can not constant fold vararg function call.\n"); +          LLVM_DEBUG(dbgs() << "Can not constant fold vararg function call.\n");            return false;          } @@ -474,21 +562,24 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,          // Execute the call, if successful, use the return value.          
ValueStack.emplace_back();          if (!EvaluateFunction(Callee, RetVal, Formals)) { -          DEBUG(dbgs() << "Failed to evaluate function.\n"); +          LLVM_DEBUG(dbgs() << "Failed to evaluate function.\n");            return false;          }          ValueStack.pop_back(); -        InstResult = RetVal; +        InstResult = castCallResultIfNeeded(CS.getCalledValue(), RetVal); +        if (RetVal && !InstResult) +          return false;          if (InstResult) { -          DEBUG(dbgs() << "Successfully evaluated function. Result: " -                       << *InstResult << "\n\n"); +          LLVM_DEBUG(dbgs() << "Successfully evaluated function. Result: " +                            << *InstResult << "\n\n");          } else { -          DEBUG(dbgs() << "Successfully evaluated function. Result: 0\n\n"); +          LLVM_DEBUG(dbgs() +                     << "Successfully evaluated function. Result: 0\n\n");          }        }      } else if (isa<TerminatorInst>(CurInst)) { -      DEBUG(dbgs() << "Found a terminator instruction.\n"); +      LLVM_DEBUG(dbgs() << "Found a terminator instruction.\n");        if (BranchInst *BI = dyn_cast<BranchInst>(CurInst)) {          if (BI->isUnconditional()) { @@ -515,17 +606,18 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,          NextBB = nullptr;        } else {          // invoke, unwind, resume, unreachable. -        DEBUG(dbgs() << "Can not handle terminator."); +        LLVM_DEBUG(dbgs() << "Can not handle terminator.");          return false;  // Cannot handle this terminator.        }        // We succeeded at evaluating this block! -      DEBUG(dbgs() << "Successfully evaluated block.\n"); +      LLVM_DEBUG(dbgs() << "Successfully evaluated block.\n");        return true;      } else {        // Did not know how to evaluate this! -      DEBUG(dbgs() << "Failed to evaluate block due to unhandled instruction." 
-            "\n"); +      LLVM_DEBUG( +          dbgs() << "Failed to evaluate block due to unhandled instruction." +                    "\n");        return false;      } @@ -539,7 +631,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,      // If we just processed an invoke, we finished evaluating the block.      if (InvokeInst *II = dyn_cast<InvokeInst>(CurInst)) {        NextBB = II->getNormalDest(); -      DEBUG(dbgs() << "Found an invoke instruction. Finished Block.\n\n"); +      LLVM_DEBUG(dbgs() << "Found an invoke instruction. Finished Block.\n\n");        return true;      } @@ -578,7 +670,7 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal,    while (true) {      BasicBlock *NextBB = nullptr; // Initialized to avoid compiler warnings. -    DEBUG(dbgs() << "Trying to evaluate BB: " << *CurBB << "\n"); +    LLVM_DEBUG(dbgs() << "Trying to evaluate BB: " << *CurBB << "\n");      if (!EvaluateBlock(CurInst, NextBB))        return false; diff --git a/lib/Transforms/Utils/FlattenCFG.cpp b/lib/Transforms/Utils/FlattenCFG.cpp index 5fdcc6d1d727..3c6c9c9a5df4 100644 --- a/lib/Transforms/Utils/FlattenCFG.cpp +++ b/lib/Transforms/Utils/FlattenCFG.cpp @@ -13,6 +13,7 @@  #include "llvm/ADT/SmallPtrSet.h"  #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Transforms/Utils/Local.h"  #include "llvm/Analysis/ValueTracking.h"  #include "llvm/IR/BasicBlock.h"  #include "llvm/IR/IRBuilder.h" @@ -24,7 +25,6 @@  #include "llvm/Support/Debug.h"  #include "llvm/Support/raw_ostream.h"  #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h"  #include <cassert>  using namespace llvm; @@ -36,16 +36,16 @@ namespace {  class FlattenCFGOpt {    AliasAnalysis *AA; -  /// \brief Use parallel-and or parallel-or to generate conditions for +  /// Use parallel-and or parallel-or to generate conditions for    /// conditional branches.    
bool FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder); -  /// \brief If \param BB is the merge block of an if-region, attempt to merge +  /// If \param BB is the merge block of an if-region, attempt to merge    /// the if-region with an adjacent if-region upstream if two if-regions    /// contain identical instructions.    bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder); -  /// \brief Compare a pair of blocks: \p Block1 and \p Block2, which +  /// Compare a pair of blocks: \p Block1 and \p Block2, which    /// are from two if-regions whose entry blocks are \p Head1 and \p    /// Head2.  \returns true if \p Block1 and \p Block2 contain identical    /// instructions, and have no memory reference alias with \p Head2. @@ -312,7 +312,7 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {      new UnreachableInst(CB->getContext(), CB);    } while (Iteration); -  DEBUG(dbgs() << "Use parallel and/or in:\n" << *FirstCondBlock); +  LLVM_DEBUG(dbgs() << "Use parallel and/or in:\n" << *FirstCondBlock);    return true;  } @@ -469,7 +469,7 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) {    // Remove \param SecondEntryBlock    SecondEntryBlock->dropAllReferences();    SecondEntryBlock->eraseFromParent(); -  DEBUG(dbgs() << "If conditions merged into:\n" << *FirstEntryBlock); +  LLVM_DEBUG(dbgs() << "If conditions merged into:\n" << *FirstEntryBlock);    return true;  } diff --git a/lib/Transforms/Utils/FunctionComparator.cpp b/lib/Transforms/Utils/FunctionComparator.cpp index bddcbd86e914..69203f9f2485 100644 --- a/lib/Transforms/Utils/FunctionComparator.cpp +++ b/lib/Transforms/Utils/FunctionComparator.cpp @@ -18,7 +18,6 @@  #include "llvm/ADT/ArrayRef.h"  #include "llvm/ADT/Hashing.h"  #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h"  #include "llvm/ADT/SmallVector.h"  #include "llvm/IR/Attributes.h"  #include "llvm/IR/BasicBlock.h" @@ -377,7 +376,7 @@ int 
FunctionComparator::cmpConstants(const Constant *L,      }    }    default: // Unknown constant, abort. -    DEBUG(dbgs() << "Looking at valueID " << L->getValueID() << "\n"); +    LLVM_DEBUG(dbgs() << "Looking at valueID " << L->getValueID() << "\n");      llvm_unreachable("Constant ValueID not recognized.");      return -1;    } @@ -710,7 +709,7 @@ int FunctionComparator::cmpInlineAsm(const InlineAsm *L,      return Res;    if (int Res = cmpNumbers(L->getDialect(), R->getDialect()))      return Res; -  llvm_unreachable("InlineAsm blocks were not uniqued."); +  assert(L->getFunctionType() != R->getFunctionType());    return 0;  } @@ -925,7 +924,7 @@ FunctionComparator::FunctionHash FunctionComparator::functionHash(Function &F) {    H.add(F.arg_size());    SmallVector<const BasicBlock *, 8> BBs; -  SmallSet<const BasicBlock *, 16> VisitedBBs; +  SmallPtrSet<const BasicBlock *, 16> VisitedBBs;    // Walk the blocks in the same order as FunctionComparator::cmpBasicBlocks(),    // accumulating the hash of the function "structure." (BB and opcode sequence) diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp index 6b5f593073b4..479816a339d0 100644 --- a/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -206,15 +206,10 @@ void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) {    // definition.    if (GV.hasName()) {      ValueInfo VI = ImportIndex.getValueInfo(GV.getGUID()); -    if (VI) { -      // Need to check all summaries are local in case of hash collisions. 
-      bool IsLocal = VI.getSummaryList().size() && -          llvm::all_of(VI.getSummaryList(), -                       [](const std::unique_ptr<GlobalValueSummary> &Summary) { -                         return Summary->isDSOLocal(); -                       }); -      if (IsLocal) -        GV.setDSOLocal(true); +    if (VI && VI.isDSOLocal()) { +      GV.setDSOLocal(true); +      if (GV.hasDLLImportStorageClass()) +        GV.setDLLStorageClass(GlobalValue::DefaultStorageClass);      }    } diff --git a/lib/Transforms/Utils/GlobalStatus.cpp b/lib/Transforms/Utils/GlobalStatus.cpp index 245fefb38ee8..ff6970db47da 100644 --- a/lib/Transforms/Utils/GlobalStatus.cpp +++ b/lib/Transforms/Utils/GlobalStatus.cpp @@ -60,7 +60,7 @@ bool llvm::isSafeToDestroyConstant(const Constant *C) {  }  static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS, -                             SmallPtrSetImpl<const PHINode *> &PhiUsers) { +                             SmallPtrSetImpl<const Value *> &VisitedUsers) {    if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V))      if (GV->isExternallyInitialized())        GS.StoredType = GlobalStatus::StoredOnce; @@ -75,7 +75,8 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,        if (!isa<PointerType>(CE->getType()))          return true; -      if (analyzeGlobalAux(CE, GS, PhiUsers)) +      // FIXME: Do we need to add constexpr selects to VisitedUsers? 
+      if (analyzeGlobalAux(CE, GS, VisitedUsers))          return true;      } else if (const Instruction *I = dyn_cast<Instruction>(UR)) {        if (!GS.HasMultipleAccessingFunctions) { @@ -137,20 +138,18 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,              GS.StoredType = GlobalStatus::Stored;            }          } -      } else if (isa<BitCastInst>(I)) { -        if (analyzeGlobalAux(I, GS, PhiUsers)) +      } else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) { +        // Skip over bitcasts and GEPs; we don't care about the type or offset +        // of the pointer. +        if (analyzeGlobalAux(I, GS, VisitedUsers))            return true; -      } else if (isa<GetElementPtrInst>(I)) { -        if (analyzeGlobalAux(I, GS, PhiUsers)) -          return true; -      } else if (isa<SelectInst>(I)) { -        if (analyzeGlobalAux(I, GS, PhiUsers)) -          return true; -      } else if (const PHINode *PN = dyn_cast<PHINode>(I)) { -        // PHI nodes we can check just like select or GEP instructions, but we -        // have to be careful about infinite recursion. -        if (PhiUsers.insert(PN).second) // Not already visited. -          if (analyzeGlobalAux(I, GS, PhiUsers)) +      } else if (isa<SelectInst>(I) || isa<PHINode>(I)) { +        // Look through selects and PHIs to find if the pointer is +        // conditionally accessed. Make sure we only visit an instruction +        // once; otherwise, we can get infinite recursion or exponential +        // compile time. 
+        if (VisitedUsers.insert(I).second) +          if (analyzeGlobalAux(I, GS, VisitedUsers))              return true;        } else if (isa<CmpInst>(I)) {          GS.IsCompared = true; @@ -191,6 +190,6 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,  GlobalStatus::GlobalStatus() = default;  bool GlobalStatus::analyzeGlobal(const Value *V, GlobalStatus &GS) { -  SmallPtrSet<const PHINode *, 16> PhiUsers; -  return analyzeGlobalAux(V, GS, PhiUsers); +  SmallPtrSet<const Value *, 16> VisitedUsers; +  return analyzeGlobalAux(V, GS, VisitedUsers);  } diff --git a/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp b/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp index b8c12ad5ea84..8382220fc9e1 100644 --- a/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp +++ b/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp @@ -161,7 +161,7 @@ void ImportedFunctionsInliningStatistics::dump(const bool Verbose) {  void ImportedFunctionsInliningStatistics::calculateRealInlines() {    // Removing duplicated Callers. 
-  std::sort(NonImportedCallers.begin(), NonImportedCallers.end()); +  llvm::sort(NonImportedCallers.begin(), NonImportedCallers.end());    NonImportedCallers.erase(        std::unique(NonImportedCallers.begin(), NonImportedCallers.end()),        NonImportedCallers.end()); @@ -190,13 +190,14 @@ ImportedFunctionsInliningStatistics::getSortedNodes() {    for (const NodesMapTy::value_type& Node : NodesMap)      SortedNodes.push_back(&Node); -  std::sort( +  llvm::sort(        SortedNodes.begin(), SortedNodes.end(),        [&](const SortedNodesTy::value_type &Lhs,            const SortedNodesTy::value_type &Rhs) {          if (Lhs->second->NumberOfInlines != Rhs->second->NumberOfInlines)            return Lhs->second->NumberOfInlines > Rhs->second->NumberOfInlines; -        if (Lhs->second->NumberOfRealInlines != Rhs->second->NumberOfRealInlines) +        if (Lhs->second->NumberOfRealInlines != +            Rhs->second->NumberOfRealInlines)            return Lhs->second->NumberOfRealInlines >                   Rhs->second->NumberOfRealInlines;          return Lhs->first() < Rhs->first(); diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index fedf6e100d6c..0315aac1cf84 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -29,6 +29,7 @@  #include "llvm/Analysis/EHPersonalities.h"  #include "llvm/Analysis/InstructionSimplify.h"  #include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Transforms/Utils/Local.h"  #include "llvm/Analysis/ValueTracking.h"  #include "llvm/IR/Argument.h"  #include "llvm/IR/BasicBlock.h" @@ -60,7 +61,6 @@  #include "llvm/Support/CommandLine.h"  #include "llvm/Support/ErrorHandling.h"  #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/Local.h"  #include "llvm/Transforms/Utils/ValueMapper.h"  #include <algorithm>  #include <cassert> @@ -72,6 +72,7 @@  #include <vector>  using namespace llvm; +using ProfileCount = 
Function::ProfileCount;  static cl::opt<bool>  EnableNoAliasConversion("enable-noalias-to-md-conversion", cl::init(true), @@ -1247,7 +1248,7 @@ static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M,    // Always generate a memcpy of alignment 1 here because we don't know    // the alignment of the src pointer.  Other optimizations can infer    // better alignment. -  Builder.CreateMemCpy(Dst, Src, Size, /*Align=*/1); +  Builder.CreateMemCpy(Dst, /*DstAlign*/1, Src, /*SrcAlign*/1, Size);  }  /// When inlining a call site that has a byval argument, @@ -1431,29 +1432,29 @@ static void updateCallerBFI(BasicBlock *CallSiteBlock,  /// Update the branch metadata for cloned call instructions.  static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap, -                              const Optional<uint64_t> &CalleeEntryCount, +                              const ProfileCount &CalleeEntryCount,                                const Instruction *TheCall,                                ProfileSummaryInfo *PSI,                                BlockFrequencyInfo *CallerBFI) { -  if (!CalleeEntryCount.hasValue() || CalleeEntryCount.getValue() < 1) +  if (!CalleeEntryCount.hasValue() || CalleeEntryCount.isSynthetic() || +      CalleeEntryCount.getCount() < 1)      return; -  Optional<uint64_t> CallSiteCount = -      PSI ? PSI->getProfileCount(TheCall, CallerBFI) : None; +  auto CallSiteCount = PSI ? PSI->getProfileCount(TheCall, CallerBFI) : None;    uint64_t CallCount =        std::min(CallSiteCount.hasValue() ? 
CallSiteCount.getValue() : 0, -               CalleeEntryCount.getValue()); +               CalleeEntryCount.getCount());    for (auto const &Entry : VMap)      if (isa<CallInst>(Entry.first))        if (auto *CI = dyn_cast_or_null<CallInst>(Entry.second)) -        CI->updateProfWeight(CallCount, CalleeEntryCount.getValue()); +        CI->updateProfWeight(CallCount, CalleeEntryCount.getCount());    for (BasicBlock &BB : *Callee)      // No need to update the callsite if it is pruned during inlining.      if (VMap.count(&BB))        for (Instruction &I : BB)          if (CallInst *CI = dyn_cast<CallInst>(&I)) -          CI->updateProfWeight(CalleeEntryCount.getValue() - CallCount, -                               CalleeEntryCount.getValue()); +          CI->updateProfWeight(CalleeEntryCount.getCount() - CallCount, +                               CalleeEntryCount.getCount());  }  /// Update the entry count of callee after inlining. @@ -1467,18 +1468,19 @@ static void updateCalleeCount(BlockFrequencyInfo *CallerBFI, BasicBlock *CallBB,    // callsite is M, the new callee count is set to N - M. M is estimated from    // the caller's entry count, its entry block frequency and the block frequency    // of the callsite. -  Optional<uint64_t> CalleeCount = Callee->getEntryCount(); +  auto CalleeCount = Callee->getEntryCount();    if (!CalleeCount.hasValue() || !PSI)      return; -  Optional<uint64_t> CallCount = PSI->getProfileCount(CallInst, CallerBFI); +  auto CallCount = PSI->getProfileCount(CallInst, CallerBFI);    if (!CallCount.hasValue())      return;    // Since CallSiteCount is an estimate, it could exceed the original callee    // count and has to be set to 0. 
-  if (CallCount.getValue() > CalleeCount.getValue()) -    Callee->setEntryCount(0); +  if (CallCount.getValue() > CalleeCount.getCount()) +    CalleeCount.setCount(0);    else -    Callee->setEntryCount(CalleeCount.getValue() - CallCount.getValue()); +    CalleeCount.setCount(CalleeCount.getCount() - CallCount.getValue()); +  Callee->setEntryCount(CalleeCount);  }  /// This function inlines the called function into the basic block of the @@ -1500,10 +1502,9 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,    IFI.reset();    Function *CalledFunc = CS.getCalledFunction(); -  if (!CalledFunc ||              // Can't inline external function or indirect -      CalledFunc->isDeclaration() || -      (!ForwardVarArgsTo && CalledFunc->isVarArg())) // call, or call to a vararg function! -      return false; +  if (!CalledFunc ||               // Can't inline external function or indirect +      CalledFunc->isDeclaration()) // call! +    return false;    // The inliner does not know how to inline through calls with operand bundles    // in general ... @@ -1568,7 +1569,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,    Instruction *CallSiteEHPad = nullptr;    if (CallerPersonality) {      EHPersonality Personality = classifyEHPersonality(CallerPersonality); -    if (isFuncletEHPersonality(Personality)) { +    if (isScopedEHPersonality(Personality)) {        Optional<OperandBundleUse> ParentFunclet =            CS.getOperandBundle(LLVMContext::OB_funclet);        if (ParentFunclet) @@ -1630,9 +1631,6 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,      auto &DL = Caller->getParent()->getDataLayout(); -    assert((CalledFunc->arg_size() == CS.arg_size() || ForwardVarArgsTo) && -           "Varargs calls can only be inlined if the Varargs are forwarded!"); -      // Calculate the vector of arguments to pass into the function cloner, which      // matches up the formal to the actual argument values.      
CallSite::arg_iterator AI = CS.arg_begin(); @@ -1815,9 +1813,12 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,    }    SmallVector<Value*,4> VarArgsToForward; +  SmallVector<AttributeSet, 4> VarArgsAttrs;    for (unsigned i = CalledFunc->getFunctionType()->getNumParams(); -       i < CS.getNumArgOperands(); i++) +       i < CS.getNumArgOperands(); i++) {      VarArgsToForward.push_back(CS.getArgOperand(i)); +    VarArgsAttrs.push_back(CS.getAttributes().getParamAttributes(i)); +  }    bool InlinedMustTailCalls = false, InlinedDeoptimizeCalls = false;    if (InlinedFunctionInfo.ContainsCalls) { @@ -1825,6 +1826,10 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,      if (CallInst *CI = dyn_cast<CallInst>(TheCall))        CallSiteTailKind = CI->getTailCallKind(); +    // For inlining purposes, the "notail" marker is the same as no marker. +    if (CallSiteTailKind == CallInst::TCK_NoTail) +      CallSiteTailKind = CallInst::TCK_None; +      for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E;           ++BB) {        for (auto II = BB->begin(); II != BB->end();) { @@ -1833,6 +1838,40 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,          if (!CI)            continue; +        // Forward varargs from inlined call site to calls to the +        // ForwardVarArgsTo function, if requested, and to musttail calls. +        if (!VarArgsToForward.empty() && +            ((ForwardVarArgsTo && +              CI->getCalledFunction() == ForwardVarArgsTo) || +             CI->isMustTailCall())) { +          // Collect attributes for non-vararg parameters. 
+          AttributeList Attrs = CI->getAttributes(); +          SmallVector<AttributeSet, 8> ArgAttrs; +          if (!Attrs.isEmpty() || !VarArgsAttrs.empty()) { +            for (unsigned ArgNo = 0; +                 ArgNo < CI->getFunctionType()->getNumParams(); ++ArgNo) +              ArgAttrs.push_back(Attrs.getParamAttributes(ArgNo)); +          } + +          // Add VarArg attributes. +          ArgAttrs.append(VarArgsAttrs.begin(), VarArgsAttrs.end()); +          Attrs = AttributeList::get(CI->getContext(), Attrs.getFnAttributes(), +                                     Attrs.getRetAttributes(), ArgAttrs); +          // Add VarArgs to existing parameters. +          SmallVector<Value *, 6> Params(CI->arg_operands()); +          Params.append(VarArgsToForward.begin(), VarArgsToForward.end()); +          CallInst *NewCI = +              CallInst::Create(CI->getCalledFunction() ? CI->getCalledFunction() +                                                       : CI->getCalledValue(), +                               Params, "", CI); +          NewCI->setDebugLoc(CI->getDebugLoc()); +          NewCI->setAttributes(Attrs); +          NewCI->setCallingConv(CI->getCallingConv()); +          CI->replaceAllUsesWith(NewCI); +          CI->eraseFromParent(); +          CI = NewCI; +        } +          if (Function *F = CI->getCalledFunction())            InlinedDeoptimizeCalls |=                F->getIntrinsicID() == Intrinsic::experimental_deoptimize; @@ -1850,6 +1889,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,          //    f -> musttail g ->     tail f  ==>  f ->     tail f          //    f ->          g -> musttail f  ==>  f ->          f          //    f ->          g ->     tail f  ==>  f ->          f +        // +        // Inlined notail calls should remain notail calls.          
CallInst::TailCallKind ChildTCK = CI->getTailCallKind();          if (ChildTCK != CallInst::TCK_NoTail)            ChildTCK = std::min(CallSiteTailKind, ChildTCK); @@ -1860,16 +1901,6 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,          // 'nounwind'.          if (MarkNoUnwind)            CI->setDoesNotThrow(); - -        if (ForwardVarArgsTo && !VarArgsToForward.empty() && -            CI->getCalledFunction() == ForwardVarArgsTo) { -          SmallVector<Value*, 6> Params(CI->arg_operands()); -          Params.append(VarArgsToForward.begin(), VarArgsToForward.end()); -          CallInst *Call = CallInst::Create(CI->getCalledFunction(), Params, "", CI); -          Call->setDebugLoc(CI->getDebugLoc()); -          CI->replaceAllUsesWith(Call); -          CI->eraseFromParent(); -        }        }      }    } diff --git a/lib/Transforms/Utils/InstructionNamer.cpp b/lib/Transforms/Utils/InstructionNamer.cpp index 23ec45edb3ef..003721f2b939 100644 --- a/lib/Transforms/Utils/InstructionNamer.cpp +++ b/lib/Transforms/Utils/InstructionNamer.cpp @@ -17,7 +17,7 @@  #include "llvm/IR/Function.h"  #include "llvm/IR/Type.h"  #include "llvm/Pass.h" -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h"  using namespace llvm;  namespace { diff --git a/lib/Transforms/Utils/IntegerDivision.cpp b/lib/Transforms/Utils/IntegerDivision.cpp index 5a90dcb033b2..3fbb3487884b 100644 --- a/lib/Transforms/Utils/IntegerDivision.cpp +++ b/lib/Transforms/Utils/IntegerDivision.cpp @@ -372,7 +372,7 @@ static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor,  /// information about the operands are known. Implements both 32bit and 64bit  /// scalar division.  /// -/// @brief Replace Rem with generated code. +/// Replace Rem with generated code.  
bool llvm::expandRemainder(BinaryOperator *Rem) {    assert((Rem->getOpcode() == Instruction::SRem ||            Rem->getOpcode() == Instruction::URem) && @@ -430,7 +430,7 @@ bool llvm::expandRemainder(BinaryOperator *Rem) {  /// when more information about the operands are known. Implements both  /// 32bit and 64bit scalar division.  /// -/// @brief Replace Div with generated code. +/// Replace Div with generated code.  bool llvm::expandDivision(BinaryOperator *Div) {    assert((Div->getOpcode() == Instruction::SDiv ||            Div->getOpcode() == Instruction::UDiv) && @@ -482,7 +482,7 @@ bool llvm::expandDivision(BinaryOperator *Div) {  /// that have no or very little suppport for smaller than 32 bit integer   /// arithmetic.  /// -/// @brief Replace Rem with emulation code. +/// Replace Rem with emulation code.  bool llvm::expandRemainderUpTo32Bits(BinaryOperator *Rem) {    assert((Rem->getOpcode() == Instruction::SRem ||            Rem->getOpcode() == Instruction::URem) && @@ -531,7 +531,7 @@ bool llvm::expandRemainderUpTo32Bits(BinaryOperator *Rem) {  /// 64 bits. Uses the above routines and extends the inputs/truncates the  /// outputs to operate in 64 bits.  /// -/// @brief Replace Rem with emulation code. +/// Replace Rem with emulation code.  bool llvm::expandRemainderUpTo64Bits(BinaryOperator *Rem) {    assert((Rem->getOpcode() == Instruction::SRem ||            Rem->getOpcode() == Instruction::URem) && @@ -580,7 +580,7 @@ bool llvm::expandRemainderUpTo64Bits(BinaryOperator *Rem) {  /// in 32 bits; that is, these routines are good for targets that have no  /// or very little support for smaller than 32 bit integer arithmetic.  /// -/// @brief Replace Div with emulation code. +/// Replace Div with emulation code.  
bool llvm::expandDivisionUpTo32Bits(BinaryOperator *Div) {    assert((Div->getOpcode() == Instruction::SDiv ||            Div->getOpcode() == Instruction::UDiv) && @@ -628,7 +628,7 @@ bool llvm::expandDivisionUpTo32Bits(BinaryOperator *Div) {  /// above routines and extends the inputs/truncates the outputs to operate  /// in 64 bits.  /// -/// @brief Replace Div with emulation code. +/// Replace Div with emulation code.  bool llvm::expandDivisionUpTo64Bits(BinaryOperator *Div) {    assert((Div->getOpcode() == Instruction::SDiv ||            Div->getOpcode() == Instruction::UDiv) && diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp index ae0e2bb6c280..956d0387c7a8 100644 --- a/lib/Transforms/Utils/LCSSA.cpp +++ b/lib/Transforms/Utils/LCSSA.cpp @@ -36,13 +36,14 @@  #include "llvm/Analysis/LoopPass.h"  #include "llvm/Analysis/ScalarEvolution.h"  #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/Transforms/Utils/Local.h"  #include "llvm/IR/Constants.h"  #include "llvm/IR/Dominators.h"  #include "llvm/IR/Function.h"  #include "llvm/IR/Instructions.h"  #include "llvm/IR/PredIteratorCache.h"  #include "llvm/Pass.h" -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h"  #include "llvm/Transforms/Utils/LoopUtils.h"  #include "llvm/Transforms/Utils/SSAUpdater.h"  using namespace llvm; @@ -214,18 +215,27 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,          Worklist.push_back(PostProcessPN);      // Keep track of PHI nodes that we want to remove because they did not have -    // any uses rewritten. +    // any uses rewritten. If the new PHI is used, store it so that we can +    // try to propagate dbg.value intrinsics to it. 
+    SmallVector<PHINode *, 2> NeedDbgValues;      for (PHINode *PN : AddedPHIs)        if (PN->use_empty())          PHIsToRemove.insert(PN); - +      else +        NeedDbgValues.push_back(PN); +    insertDebugValuesForPHIs(InstBB, NeedDbgValues);      Changed = true;    } -  // Remove PHI nodes that did not have any uses rewritten. -  for (PHINode *PN : PHIsToRemove) { -    assert (PN->use_empty() && "Trying to remove a phi with uses."); -    PN->eraseFromParent(); -  } +  // Remove PHI nodes that did not have any uses rewritten. We need to redo the +  // use_empty() check here, because even if the PHI node wasn't used when added +  // to PHIsToRemove, later added PHI nodes can be using it.  This cleanup is +  // not guaranteed to handle trees/cycles of PHI nodes that only are used by +  // each other. Such situations has only been noticed when the input IR +  // contains unreachable code, and leaving some extra redundant PHI nodes in +  // such situations is considered a minor problem. 
+  for (PHINode *PN : PHIsToRemove) +    if (PN->use_empty()) +      PN->eraseFromParent();    return Changed;  } diff --git a/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/lib/Transforms/Utils/LibCallsShrinkWrap.cpp index 42aca757c2af..9832a6f24e1f 100644 --- a/lib/Transforms/Utils/LibCallsShrinkWrap.cpp +++ b/lib/Transforms/Utils/LibCallsShrinkWrap.cpp @@ -79,11 +79,11 @@ public:    bool perform() {      bool Changed = false;      for (auto &CI : WorkList) { -      DEBUG(dbgs() << "CDCE calls: " << CI->getCalledFunction()->getName() -                   << "\n"); +      LLVM_DEBUG(dbgs() << "CDCE calls: " << CI->getCalledFunction()->getName() +                        << "\n");        if (perform(CI)) {          Changed = true; -        DEBUG(dbgs() << "Transformed\n"); +        LLVM_DEBUG(dbgs() << "Transformed\n");        }      }      return Changed; @@ -421,7 +421,7 @@ Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,                                                const LibFunc &Func) {    // FIXME: LibFunc_powf and powl TBD.    if (Func != LibFunc_pow) { -    DEBUG(dbgs() << "Not handled powf() and powl()\n"); +    LLVM_DEBUG(dbgs() << "Not handled powf() and powl()\n");      return nullptr;    } @@ -433,7 +433,7 @@ Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,    if (ConstantFP *CF = dyn_cast<ConstantFP>(Base)) {      double D = CF->getValueAPF().convertToDouble();      if (D < 1.0f || D > APInt::getMaxValue(8).getZExtValue()) { -      DEBUG(dbgs() << "Not handled pow(): constant base out of range\n"); +      LLVM_DEBUG(dbgs() << "Not handled pow(): constant base out of range\n");        return nullptr;      } @@ -447,7 +447,7 @@ Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,    // If the Base value coming from an integer type.    
Instruction *I = dyn_cast<Instruction>(Base);    if (!I) { -    DEBUG(dbgs() << "Not handled pow(): FP type base\n"); +    LLVM_DEBUG(dbgs() << "Not handled pow(): FP type base\n");      return nullptr;    }    unsigned Opcode = I->getOpcode(); @@ -461,7 +461,7 @@ Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,      else if (BW == 32)        UpperV = 32.0f;      else { -      DEBUG(dbgs() << "Not handled pow(): type too wide\n"); +      LLVM_DEBUG(dbgs() << "Not handled pow(): type too wide\n");        return nullptr;      } @@ -477,7 +477,7 @@ Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,      Value *Cond0 = BBBuilder.CreateFCmp(CmpInst::FCMP_OLE, Base, V0);      return BBBuilder.CreateOr(Cond0, Cond);    } -  DEBUG(dbgs() << "Not handled pow(): base not from integer convert\n"); +  LLVM_DEBUG(dbgs() << "Not handled pow(): base not from integer convert\n");    return nullptr;  } @@ -496,9 +496,9 @@ void LibCallsShrinkWrap::shrinkWrapCI(CallInst *CI, Value *Cond) {    SuccBB->setName("cdce.end");    CI->removeFromParent();    CallBB->getInstList().insert(CallBB->getFirstInsertionPt(), CI); -  DEBUG(dbgs() << "== Basic Block After =="); -  DEBUG(dbgs() << *CallBB->getSinglePredecessor() << *CallBB -               << *CallBB->getSingleSuccessor() << "\n"); +  LLVM_DEBUG(dbgs() << "== Basic Block After =="); +  LLVM_DEBUG(dbgs() << *CallBB->getSinglePredecessor() << *CallBB +                    << *CallBB->getSingleSuccessor() << "\n");  }  // Perform the transformation to a single candidate. @@ -529,10 +529,7 @@ static bool runImpl(Function &F, const TargetLibraryInfo &TLI,    bool Changed = CCDCE.perform();  // Verify the dominator after we've updated it locally. 
-#ifndef NDEBUG -  if (DT) -    DT->verifyDomTree(); -#endif +  assert(!DT || DT->verify(DominatorTree::VerificationLevel::Fast));    return Changed;  } diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index a1961eecb391..ae3cb077a3af 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -73,6 +73,7 @@  #include "llvm/Support/ErrorHandling.h"  #include "llvm/Support/KnownBits.h"  #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/ValueMapper.h"  #include <algorithm>  #include <cassert>  #include <climits> @@ -100,26 +101,23 @@ STATISTIC(NumRemoved, "Number of unreachable basic blocks removed");  /// conditions and indirectbr addresses this might make dead if  /// DeleteDeadConditions is true.  bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, -                                  const TargetLibraryInfo *TLI) { +                                  const TargetLibraryInfo *TLI, +                                  DeferredDominance *DDT) {    TerminatorInst *T = BB->getTerminator();    IRBuilder<> Builder(T);    // Branch - See if we are conditional jumping on constant -  if (BranchInst *BI = dyn_cast<BranchInst>(T)) { +  if (auto *BI = dyn_cast<BranchInst>(T)) {      if (BI->isUnconditional()) return false;  // Can't optimize uncond branch      BasicBlock *Dest1 = BI->getSuccessor(0);      BasicBlock *Dest2 = BI->getSuccessor(1); -    if (ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition())) { +    if (auto *Cond = dyn_cast<ConstantInt>(BI->getCondition())) {        // Are we branching on constant?        // YES.  Change to unconditional branch...        BasicBlock *Destination = Cond->getZExtValue() ? Dest1 : Dest2;        BasicBlock *OldDest     = Cond->getZExtValue() ? 
Dest2 : Dest1; -      //cerr << "Function: " << T->getParent()->getParent() -      //     << "\nRemoving branch from " << T->getParent() -      //     << "\n\nTo: " << OldDest << endl; -        // Let the basic block know that we are letting go of it.  Based on this,        // it will adjust it's PHI nodes.        OldDest->removePredecessor(BB); @@ -127,6 +125,8 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,        // Replace the conditional branch with an unconditional one.        Builder.CreateBr(Destination);        BI->eraseFromParent(); +      if (DDT) +        DDT->deleteEdge(BB, OldDest);        return true;      } @@ -150,10 +150,10 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,      return false;    } -  if (SwitchInst *SI = dyn_cast<SwitchInst>(T)) { +  if (auto *SI = dyn_cast<SwitchInst>(T)) {      // If we are switching on a constant, we can convert the switch to an      // unconditional branch. -    ConstantInt *CI = dyn_cast<ConstantInt>(SI->getCondition()); +    auto *CI = dyn_cast<ConstantInt>(SI->getCondition());      BasicBlock *DefaultDest = SI->getDefaultDest();      BasicBlock *TheOnlyDest = DefaultDest; @@ -197,9 +197,12 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,                            createBranchWeights(Weights));          }          // Remove this entry. -        DefaultDest->removePredecessor(SI->getParent()); +        BasicBlock *ParentBB = SI->getParent(); +        DefaultDest->removePredecessor(ParentBB);          i = SI->removeCase(i);          e = SI->case_end(); +        if (DDT) +          DDT->deleteEdge(ParentBB, DefaultDest);          continue;        } @@ -225,14 +228,20 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,        // Insert the new branch.        
Builder.CreateBr(TheOnlyDest);        BasicBlock *BB = SI->getParent(); +      std::vector <DominatorTree::UpdateType> Updates; +      if (DDT) +        Updates.reserve(SI->getNumSuccessors() - 1);        // Remove entries from PHI nodes which we no longer branch to...        for (BasicBlock *Succ : SI->successors()) {          // Found case matching a constant operand? -        if (Succ == TheOnlyDest) +        if (Succ == TheOnlyDest) {            TheOnlyDest = nullptr; // Don't modify the first branch to TheOnlyDest -        else +        } else {            Succ->removePredecessor(BB); +          if (DDT) +            Updates.push_back({DominatorTree::Delete, BB, Succ}); +        }        }        // Delete the old switch. @@ -240,6 +249,8 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,        SI->eraseFromParent();        if (DeleteDeadConditions)          RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI); +      if (DDT) +        DDT->applyUpdates(Updates);        return true;      } @@ -280,19 +291,28 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,      return false;    } -  if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(T)) { +  if (auto *IBI = dyn_cast<IndirectBrInst>(T)) {      // indirectbr blockaddress(@F, @BB) -> br label @BB -    if (BlockAddress *BA = +    if (auto *BA =            dyn_cast<BlockAddress>(IBI->getAddress()->stripPointerCasts())) {        BasicBlock *TheOnlyDest = BA->getBasicBlock(); +      std::vector <DominatorTree::UpdateType> Updates; +      if (DDT) +        Updates.reserve(IBI->getNumDestinations() - 1); +        // Insert the new branch.        
Builder.CreateBr(TheOnlyDest);        for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) { -        if (IBI->getDestination(i) == TheOnlyDest) +        if (IBI->getDestination(i) == TheOnlyDest) {            TheOnlyDest = nullptr; -        else -          IBI->getDestination(i)->removePredecessor(IBI->getParent()); +        } else { +          BasicBlock *ParentBB = IBI->getParent(); +          BasicBlock *DestBB = IBI->getDestination(i); +          DestBB->removePredecessor(ParentBB); +          if (DDT) +            Updates.push_back({DominatorTree::Delete, ParentBB, DestBB}); +        }        }        Value *Address = IBI->getAddress();        IBI->eraseFromParent(); @@ -307,6 +327,8 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,          new UnreachableInst(BB->getContext(), BB);        } +      if (DDT) +        DDT->applyUpdates(Updates);        return true;      }    } @@ -350,6 +372,11 @@ bool llvm::wouldInstructionBeTriviallyDead(Instruction *I,        return false;      return true;    } +  if (DbgLabelInst *DLI = dyn_cast<DbgLabelInst>(I)) { +    if (DLI->getLabel()) +      return false; +    return true; +  }    if (!I->mayHaveSideEffects())      return true; @@ -357,8 +384,9 @@ bool llvm::wouldInstructionBeTriviallyDead(Instruction *I,    // Special case intrinsics that "may have side effects" but can be deleted    // when dead.    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { -    // Safe to delete llvm.stacksave if dead. -    if (II->getIntrinsicID() == Intrinsic::stacksave) +    // Safe to delete llvm.stacksave and launder.invariant.group if dead. +    if (II->getIntrinsicID() == Intrinsic::stacksave || +        II->getIntrinsicID() == Intrinsic::launder_invariant_group)        return true;      // Lifetime intrinsics are dead when their right-hand is undef. 
@@ -406,17 +434,31 @@ llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V,    SmallVector<Instruction*, 16> DeadInsts;    DeadInsts.push_back(I); +  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI); -  do { -    I = DeadInsts.pop_back_val(); +  return true; +} + +void llvm::RecursivelyDeleteTriviallyDeadInstructions( +    SmallVectorImpl<Instruction *> &DeadInsts, const TargetLibraryInfo *TLI) { +  // Process the dead instruction list until empty. +  while (!DeadInsts.empty()) { +    Instruction &I = *DeadInsts.pop_back_val(); +    assert(I.use_empty() && "Instructions with uses are not dead."); +    assert(isInstructionTriviallyDead(&I, TLI) && +           "Live instruction found in dead worklist!"); + +    // Don't lose the debug info while deleting the instructions. +    salvageDebugInfo(I);      // Null out all of the instruction's operands to see if any operand becomes      // dead as we go. -    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { -      Value *OpV = I->getOperand(i); -      I->setOperand(i, nullptr); +    for (Use &OpU : I.operands()) { +      Value *OpV = OpU.get(); +      OpU.set(nullptr); -      if (!OpV->use_empty()) continue; +      if (!OpV->use_empty()) +        continue;        // If the operand is an instruction that became dead as we nulled out the        // operand, and if it is 'trivially' dead, delete it in a future loop @@ -426,10 +468,8 @@ llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V,            DeadInsts.push_back(OpI);      } -    I->eraseFromParent(); -  } while (!DeadInsts.empty()); - -  return true; +    I.eraseFromParent(); +  }  }  /// areAllUsesEqual - Check whether the uses of a value are all the same. 
@@ -481,6 +521,8 @@ simplifyAndDCEInstruction(Instruction *I,                            const DataLayout &DL,                            const TargetLibraryInfo *TLI) {    if (isInstructionTriviallyDead(I, TLI)) { +    salvageDebugInfo(*I); +      // Null out all of the instruction's operands to see if any operand becomes      // dead as we go.      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { @@ -583,7 +625,8 @@ bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB,  ///  /// .. and delete the predecessor corresponding to the '1', this will attempt to  /// recursively fold the and to 0. -void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred) { +void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred, +                                        DeferredDominance *DDT) {    // This only adjusts blocks with PHI nodes.    if (!isa<PHINode>(BB->begin()))      return; @@ -606,13 +649,18 @@ void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred) {      // of the block.      if (PhiIt != OldPhiIt) PhiIt = &BB->front();    } +  if (DDT) +    DDT->deleteEdge(Pred, BB);  }  /// MergeBasicBlockIntoOnlyPred - DestBB is a block with one predecessor and its  /// predecessor is known to have one successor (DestBB!).  Eliminate the edge  /// between them, moving the instructions in the predecessor into DestBB and  /// deleting the predecessor block. -void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DominatorTree *DT) { +void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DominatorTree *DT, +                                       DeferredDominance *DDT) { +  assert(!(DT && DDT) && "Cannot call with both DT and DDT."); +    // If BB has single-entry PHI nodes, fold them.    
while (PHINode *PN = dyn_cast<PHINode>(DestBB->begin())) {      Value *NewVal = PN->getIncomingValue(0); @@ -625,6 +673,24 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DominatorTree *DT) {    BasicBlock *PredBB = DestBB->getSinglePredecessor();    assert(PredBB && "Block doesn't have a single predecessor!"); +  bool ReplaceEntryBB = false; +  if (PredBB == &DestBB->getParent()->getEntryBlock()) +    ReplaceEntryBB = true; + +  // Deferred DT update: Collect all the edges that enter PredBB. These +  // dominator edges will be redirected to DestBB. +  std::vector <DominatorTree::UpdateType> Updates; +  if (DDT && !ReplaceEntryBB) { +    Updates.reserve(1 + (2 * pred_size(PredBB))); +    Updates.push_back({DominatorTree::Delete, PredBB, DestBB}); +    for (auto I = pred_begin(PredBB), E = pred_end(PredBB); I != E; ++I) { +      Updates.push_back({DominatorTree::Delete, *I, PredBB}); +      // This predecessor of PredBB may already have DestBB as a successor. +      if (llvm::find(successors(*I), DestBB) == succ_end(*I)) +        Updates.push_back({DominatorTree::Insert, *I, DestBB}); +    } +  } +    // Zap anything that took the address of DestBB.  Not doing this will give the    // address an invalid value.    if (DestBB->hasAddressTaken()) { @@ -645,7 +711,7 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DominatorTree *DT) {    // If the PredBB is the entry block of the function, move DestBB up to    // become the entry block after we erase PredBB. -  if (PredBB == &DestBB->getParent()->getEntryBlock()) +  if (ReplaceEntryBB)      DestBB->moveAfter(PredBB);    if (DT) { @@ -657,8 +723,19 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DominatorTree *DT) {        DT->eraseNode(PredBB);      }    } -  // Nuke BB. -  PredBB->eraseFromParent(); + +  if (DDT) { +    DDT->deleteBB(PredBB); // Deferred deletion of BB. 
+    if (ReplaceEntryBB) +      // The entry block was removed and there is no external interface for the +      // dominator tree to be notified of this change. In this corner-case we +      // recalculate the entire tree. +      DDT->recalculate(*(DestBB->getParent())); +    else +      DDT->applyUpdates(Updates); +  } else { +    PredBB->eraseFromParent(); // Nuke BB. +  }  }  /// CanMergeValues - Return true if we can choose one of these values to use @@ -675,8 +752,8 @@ static bool CanMergeValues(Value *First, Value *Second) {  static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {    assert(*succ_begin(BB) == Succ && "Succ is not successor of BB!"); -  DEBUG(dbgs() << "Looking to fold " << BB->getName() << " into " -        << Succ->getName() << "\n"); +  LLVM_DEBUG(dbgs() << "Looking to fold " << BB->getName() << " into " +                    << Succ->getName() << "\n");    // Shortcut, if there is only a single predecessor it must be BB and merging    // is always safe    if (Succ->getSinglePredecessor()) return true; @@ -699,10 +776,11 @@ static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {          if (BBPreds.count(IBB) &&              !CanMergeValues(BBPN->getIncomingValueForBlock(IBB),                              PN->getIncomingValue(PI))) { -          DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in " -                << Succ->getName() << " is conflicting with " -                << BBPN->getName() << " with regard to common predecessor " -                << IBB->getName() << "\n"); +          LLVM_DEBUG(dbgs() +                     << "Can't fold, phi node " << PN->getName() << " in " +                     << Succ->getName() << " is conflicting with " +                     << BBPN->getName() << " with regard to common predecessor " +                     << IBB->getName() << "\n");            return false;          }        } @@ -715,9 +793,10 @@ static bool 
CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {          BasicBlock *IBB = PN->getIncomingBlock(PI);          if (BBPreds.count(IBB) &&              !CanMergeValues(Val, PN->getIncomingValue(PI))) { -          DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in " -                << Succ->getName() << " is conflicting with regard to common " -                << "predecessor " << IBB->getName() << "\n"); +          LLVM_DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() +                            << " in " << Succ->getName() +                            << " is conflicting with regard to common " +                            << "predecessor " << IBB->getName() << "\n");            return false;          }        } @@ -730,7 +809,7 @@ static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {  using PredBlockVector = SmallVector<BasicBlock *, 16>;  using IncomingValueMap = DenseMap<BasicBlock *, Value *>; -/// \brief Determines the value to use as the phi node input for a block. +/// Determines the value to use as the phi node input for a block.  ///  /// Select between \p OldVal any value that we know flows from \p BB  /// to a particular phi on the basis of which one (if either) is not @@ -759,7 +838,7 @@ static Value *selectIncomingValueForBlock(Value *OldVal, BasicBlock *BB,    return OldVal;  } -/// \brief Create a map from block to value for the operands of a +/// Create a map from block to value for the operands of a  /// given phi.  ///  /// Create a map from block to value for each non-undef value flowing @@ -778,7 +857,7 @@ static void gatherIncomingValuesToPhi(PHINode *PN,    }  } -/// \brief Replace the incoming undef values to a phi with the values +/// Replace the incoming undef values to a phi with the values  /// from a block-to-value map.  ///  /// \param PN The phi we are replacing the undefs in. 
@@ -798,7 +877,7 @@ static void replaceUndefValuesInPhi(PHINode *PN,    }  } -/// \brief Replace a value flowing from a block to a phi with +/// Replace a value flowing from a block to a phi with  /// potentially multiple instances of that value flowing from the  /// block's predecessors to the phi.  /// @@ -865,7 +944,8 @@ static void redirectValuesFromPredecessorsToPhi(BasicBlock *BB,  /// potential side-effect free intrinsics and the branch.  If possible,  /// eliminate BB by rewriting all the predecessors to branch to the successor  /// block and return true.  If we can't transform, return false. -bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) { +bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, +                                                   DeferredDominance *DDT) {    assert(BB != &BB->getParent()->getEntryBlock() &&           "TryToSimplifyUncondBranchFromEmptyBlock called on entry block!"); @@ -904,7 +984,20 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) {      }    } -  DEBUG(dbgs() << "Killing Trivial BB: \n" << *BB); +  LLVM_DEBUG(dbgs() << "Killing Trivial BB: \n" << *BB); + +  std::vector<DominatorTree::UpdateType> Updates; +  if (DDT) { +    Updates.reserve(1 + (2 * pred_size(BB))); +    Updates.push_back({DominatorTree::Delete, BB, Succ}); +    // All predecessors of BB will be moved to Succ. +    for (auto I = pred_begin(BB), E = pred_end(BB); I != E; ++I) { +      Updates.push_back({DominatorTree::Delete, *I, BB}); +      // This predecessor of BB may already have Succ as a successor. +      if (llvm::find(successors(*I), Succ) == succ_end(*I)) +        Updates.push_back({DominatorTree::Insert, *I, Succ}); +    } +  }    if (isa<PHINode>(Succ->begin())) {      // If there is more than one pred of succ, and there are PHI nodes in @@ -950,7 +1043,13 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) {    // Everything that jumped to BB now goes to Succ.    
BB->replaceAllUsesWith(Succ);    if (!Succ->hasName()) Succ->takeName(BB); -  BB->eraseFromParent();              // Delete the old basic block. + +  if (DDT) { +    DDT->deleteBB(BB); // Deferred deletion of the old basic block. +    DDT->applyUpdates(Updates); +  } else { +    BB->eraseFromParent(); // Delete the old basic block. +  }    return true;  } @@ -1129,6 +1228,31 @@ static bool PhiHasDebugValue(DILocalVariable *DIVar,    return false;  } +/// Check if the alloc size of \p ValTy is large enough to cover the variable +/// (or fragment of the variable) described by \p DII. +/// +/// This is primarily intended as a helper for the different +/// ConvertDebugDeclareToDebugValue functions. The dbg.declare/dbg.addr that is +/// converted describes an alloca'd variable, so we need to use the +/// alloc size of the value when doing the comparison. E.g. an i1 value will be +/// identified as covering an n-bit fragment, if the store size of i1 is at +/// least n bits. +static bool valueCoversEntireFragment(Type *ValTy, DbgInfoIntrinsic *DII) { +  const DataLayout &DL = DII->getModule()->getDataLayout(); +  uint64_t ValueSize = DL.getTypeAllocSizeInBits(ValTy); +  if (auto FragmentSize = DII->getFragmentSizeInBits()) +    return ValueSize >= *FragmentSize; +  // We can't always calculate the size of the DI variable (e.g. if it is a +  // VLA). Try to use the size of the alloca that the dbg intrinsic describes +  // intead. +  if (DII->isAddressOfVariable()) +    if (auto *AI = dyn_cast_or_null<AllocaInst>(DII->getVariableLocation())) +      if (auto FragmentSize = AI->getAllocationSizeInBits(DL)) +        return ValueSize >= *FragmentSize; +  // Could not determine size of variable. Conservatively return false. +  return false; +} +  /// Inserts a llvm.dbg.value intrinsic before a store to an alloca'd value  /// that has an associated llvm.dbg.declare or llvm.dbg.addr intrinsic.  
void llvm::ConvertDebugDeclareToDebugValue(DbgInfoIntrinsic *DII, @@ -1139,6 +1263,21 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgInfoIntrinsic *DII,    auto *DIExpr = DII->getExpression();    Value *DV = SI->getOperand(0); +  if (!valueCoversEntireFragment(SI->getValueOperand()->getType(), DII)) { +    // FIXME: If storing to a part of the variable described by the dbg.declare, +    // then we want to insert a dbg.value for the corresponding fragment. +    LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: " +                      << *DII << '\n'); +    // For now, when there is a store to parts of the variable (but we do not +    // know which part) we insert an dbg.value instrinsic to indicate that we +    // know nothing about the variable's content. +    DV = UndefValue::get(DV->getType()); +    if (!LdStHasDebugValue(DIVar, DIExpr, SI)) +      Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, DII->getDebugLoc(), +                                      SI); +    return; +  } +    // If an argument is zero extended then use argument directly. The ZExt    // may be zapped by an optimization pass in future.    Argument *ExtendedArg = nullptr; @@ -1182,6 +1321,15 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgInfoIntrinsic *DII,    if (LdStHasDebugValue(DIVar, DIExpr, LI))      return; +  if (!valueCoversEntireFragment(LI->getType(), DII)) { +    // FIXME: If only referring to a part of the variable described by the +    // dbg.declare, then we want to insert a dbg.value for the corresponding +    // fragment. +    LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: " +                      << *DII << '\n'); +    return; +  } +    // We are now tracking the loaded value instead of the address. 
In the    // future if multi-location support is added to the IR, it might be    // preferable to keep tracking both the loaded value and the original @@ -1202,6 +1350,15 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgInfoIntrinsic *DII,    if (PhiHasDebugValue(DIVar, DIExpr, APN))      return; +  if (!valueCoversEntireFragment(APN->getType(), DII)) { +    // FIXME: If only referring to a part of the variable described by the +    // dbg.declare, then we want to insert a dbg.value for the corresponding +    // fragment. +    LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: " +                      << *DII << '\n'); +    return; +  } +    BasicBlock *BB = APN->getParent();    auto InsertionPt = BB->getFirstInsertionPt(); @@ -1241,33 +1398,91 @@ bool llvm::LowerDbgDeclare(Function &F) {      // stored on the stack, while the dbg.declare can only describe      // the stack slot (and at a lexical-scope granularity). Later      // passes will attempt to elide the stack slot. -    if (AI && !isArray(AI)) { -      for (auto &AIUse : AI->uses()) { -        User *U = AIUse.getUser(); -        if (StoreInst *SI = dyn_cast<StoreInst>(U)) { -          if (AIUse.getOperandNo() == 1) -            ConvertDebugDeclareToDebugValue(DDI, SI, DIB); -        } else if (LoadInst *LI = dyn_cast<LoadInst>(U)) { -          ConvertDebugDeclareToDebugValue(DDI, LI, DIB); -        } else if (CallInst *CI = dyn_cast<CallInst>(U)) { -          // This is a call by-value or some other instruction that -          // takes a pointer to the variable. Insert a *value* -          // intrinsic that describes the alloca. -          DIB.insertDbgValueIntrinsic(AI, DDI->getVariable(), -                                      DDI->getExpression(), DDI->getDebugLoc(), -                                      CI); -        } +    if (!AI || isArray(AI)) +      continue; + +    // A volatile load/store means that the alloca can't be elided anyway. 
+    if (llvm::any_of(AI->users(), [](User *U) -> bool { +          if (LoadInst *LI = dyn_cast<LoadInst>(U)) +            return LI->isVolatile(); +          if (StoreInst *SI = dyn_cast<StoreInst>(U)) +            return SI->isVolatile(); +          return false; +        })) +      continue; + +    for (auto &AIUse : AI->uses()) { +      User *U = AIUse.getUser(); +      if (StoreInst *SI = dyn_cast<StoreInst>(U)) { +        if (AIUse.getOperandNo() == 1) +          ConvertDebugDeclareToDebugValue(DDI, SI, DIB); +      } else if (LoadInst *LI = dyn_cast<LoadInst>(U)) { +        ConvertDebugDeclareToDebugValue(DDI, LI, DIB); +      } else if (CallInst *CI = dyn_cast<CallInst>(U)) { +        // This is a call by-value or some other instruction that takes a +        // pointer to the variable. Insert a *value* intrinsic that describes +        // the variable by dereferencing the alloca. +        auto *DerefExpr = +            DIExpression::append(DDI->getExpression(), dwarf::DW_OP_deref); +        DIB.insertDbgValueIntrinsic(AI, DDI->getVariable(), DerefExpr, +                                    DDI->getDebugLoc(), CI);        } -      DDI->eraseFromParent();      } +    DDI->eraseFromParent();    }    return true;  } +/// Propagate dbg.value intrinsics through the newly inserted PHIs. +void llvm::insertDebugValuesForPHIs(BasicBlock *BB, +                                    SmallVectorImpl<PHINode *> &InsertedPHIs) { +  assert(BB && "No BasicBlock to clone dbg.value(s) from."); +  if (InsertedPHIs.size() == 0) +    return; + +  // Map existing PHI nodes to their dbg.values. 
+  ValueToValueMapTy DbgValueMap; +  for (auto &I : *BB) { +    if (auto DbgII = dyn_cast<DbgInfoIntrinsic>(&I)) { +      if (auto *Loc = dyn_cast_or_null<PHINode>(DbgII->getVariableLocation())) +        DbgValueMap.insert({Loc, DbgII}); +    } +  } +  if (DbgValueMap.size() == 0) +    return; + +  // Then iterate through the new PHIs and look to see if they use one of the +  // previously mapped PHIs. If so, insert a new dbg.value intrinsic that will +  // propagate the info through the new PHI. +  LLVMContext &C = BB->getContext(); +  for (auto PHI : InsertedPHIs) { +    BasicBlock *Parent = PHI->getParent(); +    // Avoid inserting an intrinsic into an EH block. +    if (Parent->getFirstNonPHI()->isEHPad()) +      continue; +    auto PhiMAV = MetadataAsValue::get(C, ValueAsMetadata::get(PHI)); +    for (auto VI : PHI->operand_values()) { +      auto V = DbgValueMap.find(VI); +      if (V != DbgValueMap.end()) { +        auto *DbgII = cast<DbgInfoIntrinsic>(V->second); +        Instruction *NewDbgII = DbgII->clone(); +        NewDbgII->setOperand(0, PhiMAV); +        auto InsertionPt = Parent->getFirstInsertionPt(); +        assert(InsertionPt != Parent->end() && "Ill-formed basic block"); +        NewDbgII->insertBefore(&*InsertionPt); +      } +    } +  } +} +  /// Finds all intrinsics declaring local variables as living in the memory that  /// 'V' points to. This may include a mix of dbg.declare and  /// dbg.addr intrinsics.  TinyPtrVector<DbgInfoIntrinsic *> llvm::FindDbgAddrUses(Value *V) { +  // This function is hot. Check whether the value has any metadata to avoid a +  // DenseMap lookup. +  if (!V->isUsedByMetadata()) +    return {};    auto *L = LocalAsMetadata::getIfExists(V);    if (!L)      return {}; @@ -1286,6 +1501,10 @@ TinyPtrVector<DbgInfoIntrinsic *> llvm::FindDbgAddrUses(Value *V) {  }  void llvm::findDbgValues(SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V) { +  // This function is hot. 
Check whether the value has any metadata to avoid a +  // DenseMap lookup. +  if (!V->isUsedByMetadata()) +    return;    if (auto *L = LocalAsMetadata::getIfExists(V))      if (auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L))        for (User *U : MDV->users()) @@ -1293,8 +1512,12 @@ void llvm::findDbgValues(SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V) {            DbgValues.push_back(DVI);  } -static void findDbgUsers(SmallVectorImpl<DbgInfoIntrinsic *> &DbgUsers, -                         Value *V) { +void llvm::findDbgUsers(SmallVectorImpl<DbgInfoIntrinsic *> &DbgUsers, +                        Value *V) { +  // This function is hot. Check whether the value has any metadata to avoid a +  // DenseMap lookup. +  if (!V->isUsedByMetadata()) +    return;    if (auto *L = LocalAsMetadata::getIfExists(V))      if (auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L))        for (User *U : MDV->users()) @@ -1312,11 +1535,11 @@ bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress,      auto *DIExpr = DII->getExpression();      assert(DIVar && "Missing variable");      DIExpr = DIExpression::prepend(DIExpr, DerefBefore, Offset, DerefAfter); -    // Insert llvm.dbg.declare immediately after InsertBefore, and remove old +    // Insert llvm.dbg.declare immediately before InsertBefore, and remove old      // llvm.dbg.declare.      Builder.insertDeclare(NewAddress, DIVar, DIExpr, Loc, InsertBefore);      if (DII == InsertBefore) -      InsertBefore = &*std::next(InsertBefore->getIterator()); +      InsertBefore = InsertBefore->getNextNode();      DII->eraseFromParent();    }    return !DbgAddrs.empty(); @@ -1368,66 +1591,293 @@ void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress,        }  } -void llvm::salvageDebugInfo(Instruction &I) { -  SmallVector<DbgValueInst *, 1> DbgValues; +/// Wrap \p V in a ValueAsMetadata instance. 
+static MetadataAsValue *wrapValueInMetadata(LLVMContext &C, Value *V) { +  return MetadataAsValue::get(C, ValueAsMetadata::get(V)); +} + +bool llvm::salvageDebugInfo(Instruction &I) { +  SmallVector<DbgInfoIntrinsic *, 1> DbgUsers; +  findDbgUsers(DbgUsers, &I); +  if (DbgUsers.empty()) +    return false; +    auto &M = *I.getModule(); +  auto &DL = M.getDataLayout(); +  auto &Ctx = I.getContext(); +  auto wrapMD = [&](Value *V) { return wrapValueInMetadata(Ctx, V); }; -  auto wrapMD = [&](Value *V) { -    return MetadataAsValue::get(I.getContext(), ValueAsMetadata::get(V)); +  auto doSalvage = [&](DbgInfoIntrinsic *DII, SmallVectorImpl<uint64_t> &Ops) { +    auto *DIExpr = DII->getExpression(); +    if (!Ops.empty()) { +      // Do not add DW_OP_stack_value for DbgDeclare and DbgAddr, because they +      // are implicitly pointing out the value as a DWARF memory location +      // description. +      bool WithStackValue = isa<DbgValueInst>(DII); +      DIExpr = DIExpression::prependOpcodes(DIExpr, Ops, WithStackValue); +    } +    DII->setOperand(0, wrapMD(I.getOperand(0))); +    DII->setOperand(2, MetadataAsValue::get(Ctx, DIExpr)); +    LLVM_DEBUG(dbgs() << "SALVAGE: " << *DII << '\n');    }; -  auto applyOffset = [&](DbgValueInst *DVI, uint64_t Offset) { -    auto *DIExpr = DVI->getExpression(); -    DIExpr = DIExpression::prepend(DIExpr, DIExpression::NoDeref, Offset, -                                   DIExpression::NoDeref, -                                   DIExpression::WithStackValue); -    DVI->setOperand(0, wrapMD(I.getOperand(0))); -    DVI->setOperand(2, MetadataAsValue::get(I.getContext(), DIExpr)); -    DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n'); +  auto applyOffset = [&](DbgInfoIntrinsic *DII, uint64_t Offset) { +    SmallVector<uint64_t, 8> Ops; +    DIExpression::appendOffset(Ops, Offset); +    doSalvage(DII, Ops);    }; -  if (isa<BitCastInst>(&I) || isa<IntToPtrInst>(&I)) { -    // Bitcasts are entirely irrelevant for debug info. 
Rewrite dbg.value, -    // dbg.addr, and dbg.declare to use the cast's source. -    SmallVector<DbgInfoIntrinsic *, 1> DbgUsers; -    findDbgUsers(DbgUsers, &I); +  auto applyOps = [&](DbgInfoIntrinsic *DII, +                      std::initializer_list<uint64_t> Opcodes) { +    SmallVector<uint64_t, 8> Ops(Opcodes); +    doSalvage(DII, Ops); +  }; + +  if (auto *CI = dyn_cast<CastInst>(&I)) { +    if (!CI->isNoopCast(DL)) +      return false; + +    // No-op casts are irrelevant for debug info. +    MetadataAsValue *CastSrc = wrapMD(I.getOperand(0));      for (auto *DII : DbgUsers) { -      DII->setOperand(0, wrapMD(I.getOperand(0))); -      DEBUG(dbgs() << "SALVAGE: " << *DII << '\n'); +      DII->setOperand(0, CastSrc); +      LLVM_DEBUG(dbgs() << "SALVAGE: " << *DII << '\n');      } +    return true;    } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { -    findDbgValues(DbgValues, &I); -    for (auto *DVI : DbgValues) { -      unsigned BitWidth = -          M.getDataLayout().getPointerSizeInBits(GEP->getPointerAddressSpace()); -      APInt Offset(BitWidth, 0); -      // Rewrite a constant GEP into a DIExpression.  Since we are performing -      // arithmetic to compute the variable's *value* in the DIExpression, we -      // need to mark the expression with a DW_OP_stack_value. -      if (GEP->accumulateConstantOffset(M.getDataLayout(), Offset)) -        // GEP offsets are i32 and thus always fit into an int64_t. -        applyOffset(DVI, Offset.getSExtValue()); -    } +    unsigned BitWidth = +        M.getDataLayout().getIndexSizeInBits(GEP->getPointerAddressSpace()); +    // Rewrite a constant GEP into a DIExpression.  Since we are performing +    // arithmetic to compute the variable's *value* in the DIExpression, we +    // need to mark the expression with a DW_OP_stack_value. 
+    APInt Offset(BitWidth, 0); +    if (GEP->accumulateConstantOffset(M.getDataLayout(), Offset)) +      for (auto *DII : DbgUsers) +        applyOffset(DII, Offset.getSExtValue()); +    return true;    } else if (auto *BI = dyn_cast<BinaryOperator>(&I)) { -    if (BI->getOpcode() == Instruction::Add) -      if (auto *ConstInt = dyn_cast<ConstantInt>(I.getOperand(1))) -        if (ConstInt->getBitWidth() <= 64) { -          APInt Offset = ConstInt->getValue(); -          findDbgValues(DbgValues, &I); -          for (auto *DVI : DbgValues) -            applyOffset(DVI, Offset.getSExtValue()); -        } +    // Rewrite binary operations with constant integer operands. +    auto *ConstInt = dyn_cast<ConstantInt>(I.getOperand(1)); +    if (!ConstInt || ConstInt->getBitWidth() > 64) +      return false; + +    uint64_t Val = ConstInt->getSExtValue(); +    for (auto *DII : DbgUsers) { +      switch (BI->getOpcode()) { +      case Instruction::Add: +        applyOffset(DII, Val); +        break; +      case Instruction::Sub: +        applyOffset(DII, -int64_t(Val)); +        break; +      case Instruction::Mul: +        applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_mul}); +        break; +      case Instruction::SDiv: +        applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_div}); +        break; +      case Instruction::SRem: +        applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_mod}); +        break; +      case Instruction::Or: +        applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_or}); +        break; +      case Instruction::And: +        applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_and}); +        break; +      case Instruction::Xor: +        applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_xor}); +        break; +      case Instruction::Shl: +        applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_shl}); +        break; +      case Instruction::LShr: +        applyOps(DII, {dwarf::DW_OP_constu, Val, 
dwarf::DW_OP_shr}); +        break; +      case Instruction::AShr: +        applyOps(DII, {dwarf::DW_OP_constu, Val, dwarf::DW_OP_shra}); +        break; +      default: +        // TODO: Salvage constants from each kind of binop we know about. +        return false; +      } +    } +    return true;    } else if (isa<LoadInst>(&I)) { -    findDbgValues(DbgValues, &I); -    for (auto *DVI : DbgValues) { +    MetadataAsValue *AddrMD = wrapMD(I.getOperand(0)); +    for (auto *DII : DbgUsers) {        // Rewrite the load into DW_OP_deref. -      auto *DIExpr = DVI->getExpression(); +      auto *DIExpr = DII->getExpression();        DIExpr = DIExpression::prepend(DIExpr, DIExpression::WithDeref); -      DVI->setOperand(0, wrapMD(I.getOperand(0))); -      DVI->setOperand(2, MetadataAsValue::get(I.getContext(), DIExpr)); -      DEBUG(dbgs() << "SALVAGE:  " << *DVI << '\n'); +      DII->setOperand(0, AddrMD); +      DII->setOperand(2, MetadataAsValue::get(Ctx, DIExpr)); +      LLVM_DEBUG(dbgs() << "SALVAGE:  " << *DII << '\n'); +    } +    return true; +  } +  return false; +} + +/// A replacement for a dbg.value expression. +using DbgValReplacement = Optional<DIExpression *>; + +/// Point debug users of \p From to \p To using exprs given by \p RewriteExpr, +/// possibly moving/deleting users to prevent use-before-def. Returns true if +/// changes are made. +static bool rewriteDebugUsers( +    Instruction &From, Value &To, Instruction &DomPoint, DominatorTree &DT, +    function_ref<DbgValReplacement(DbgInfoIntrinsic &DII)> RewriteExpr) { +  // Find debug users of From. +  SmallVector<DbgInfoIntrinsic *, 1> Users; +  findDbgUsers(Users, &From); +  if (Users.empty()) +    return false; + +  // Prevent use-before-def of To. 
+  bool Changed = false; +  SmallPtrSet<DbgInfoIntrinsic *, 1> DeleteOrSalvage; +  if (isa<Instruction>(&To)) { +    bool DomPointAfterFrom = From.getNextNonDebugInstruction() == &DomPoint; + +    for (auto *DII : Users) { +      // It's common to see a debug user between From and DomPoint. Move it +      // after DomPoint to preserve the variable update without any reordering. +      if (DomPointAfterFrom && DII->getNextNonDebugInstruction() == &DomPoint) { +        LLVM_DEBUG(dbgs() << "MOVE:  " << *DII << '\n'); +        DII->moveAfter(&DomPoint); +        Changed = true; + +      // Users which otherwise aren't dominated by the replacement value must +      // be salvaged or deleted. +      } else if (!DT.dominates(&DomPoint, DII)) { +        DeleteOrSalvage.insert(DII); +      }      }    } + +  // Update debug users without use-before-def risk. +  for (auto *DII : Users) { +    if (DeleteOrSalvage.count(DII)) +      continue; + +    LLVMContext &Ctx = DII->getContext(); +    DbgValReplacement DVR = RewriteExpr(*DII); +    if (!DVR) +      continue; + +    DII->setOperand(0, wrapValueInMetadata(Ctx, &To)); +    DII->setOperand(2, MetadataAsValue::get(Ctx, *DVR)); +    LLVM_DEBUG(dbgs() << "REWRITE:  " << *DII << '\n'); +    Changed = true; +  } + +  if (!DeleteOrSalvage.empty()) { +    // Try to salvage the remaining debug users. +    Changed |= salvageDebugInfo(From); + +    // Delete the debug users which weren't salvaged. +    for (auto *DII : DeleteOrSalvage) { +      if (DII->getVariableLocation() == &From) { +        LLVM_DEBUG(dbgs() << "Erased UseBeforeDef:  " << *DII << '\n'); +        DII->eraseFromParent(); +        Changed = true; +      } +    } +  } + +  return Changed; +} + +/// Check if a bitcast between a value of type \p FromTy to type \p ToTy would +/// losslessly preserve the bits and semantics of the value. This predicate is +/// symmetric, i.e swapping \p FromTy and \p ToTy should give the same result. 
+/// +/// Note that Type::canLosslesslyBitCastTo is not suitable here because it +/// allows semantically unequivalent bitcasts, such as <2 x i64> -> <4 x i32>, +/// and also does not allow lossless pointer <-> integer conversions. +static bool isBitCastSemanticsPreserving(const DataLayout &DL, Type *FromTy, +                                         Type *ToTy) { +  // Trivially compatible types. +  if (FromTy == ToTy) +    return true; + +  // Handle compatible pointer <-> integer conversions. +  if (FromTy->isIntOrPtrTy() && ToTy->isIntOrPtrTy()) { +    bool SameSize = DL.getTypeSizeInBits(FromTy) == DL.getTypeSizeInBits(ToTy); +    bool LosslessConversion = !DL.isNonIntegralPointerType(FromTy) && +                              !DL.isNonIntegralPointerType(ToTy); +    return SameSize && LosslessConversion; +  } + +  // TODO: This is not exhaustive. +  return false; +} + +bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To, +                                 Instruction &DomPoint, DominatorTree &DT) { +  // Exit early if From has no debug users. +  if (!From.isUsedByMetadata()) +    return false; + +  assert(&From != &To && "Can't replace something with itself"); + +  Type *FromTy = From.getType(); +  Type *ToTy = To.getType(); + +  auto Identity = [&](DbgInfoIntrinsic &DII) -> DbgValReplacement { +    return DII.getExpression(); +  }; + +  // Handle no-op conversions. +  Module &M = *From.getModule(); +  const DataLayout &DL = M.getDataLayout(); +  if (isBitCastSemanticsPreserving(DL, FromTy, ToTy)) +    return rewriteDebugUsers(From, To, DomPoint, DT, Identity); + +  // Handle integer-to-integer widening and narrowing. +  // FIXME: Use DW_OP_convert when it's available everywhere. 
+  if (FromTy->isIntegerTy() && ToTy->isIntegerTy()) { +    uint64_t FromBits = FromTy->getPrimitiveSizeInBits(); +    uint64_t ToBits = ToTy->getPrimitiveSizeInBits(); +    assert(FromBits != ToBits && "Unexpected no-op conversion"); + +    // When the width of the result grows, assume that a debugger will only +    // access the low `FromBits` bits when inspecting the source variable. +    if (FromBits < ToBits) +      return rewriteDebugUsers(From, To, DomPoint, DT, Identity); + +    // The width of the result has shrunk. Use sign/zero extension to describe +    // the source variable's high bits. +    auto SignOrZeroExt = [&](DbgInfoIntrinsic &DII) -> DbgValReplacement { +      DILocalVariable *Var = DII.getVariable(); + +      // Without knowing signedness, sign/zero extension isn't possible. +      auto Signedness = Var->getSignedness(); +      if (!Signedness) +        return None; + +      bool Signed = *Signedness == DIBasicType::Signedness::Signed; + +      if (!Signed) { +        // In the unsigned case, assume that a debugger will initialize the +        // high bits to 0 and do a no-op conversion. +        return Identity(DII); +      } else { +        // In the signed case, the high bits are given by sign extension, i.e: +        //   (To >> (ToBits - 1)) * ((2 ^ FromBits) - 1) +        // Calculate the high bits and OR them together with the low bits. +        SmallVector<uint64_t, 8> Ops({dwarf::DW_OP_dup, dwarf::DW_OP_constu, +                                      (ToBits - 1), dwarf::DW_OP_shr, +                                      dwarf::DW_OP_lit0, dwarf::DW_OP_not, +                                      dwarf::DW_OP_mul, dwarf::DW_OP_or}); +        return DIExpression::appendToStack(DII.getExpression(), Ops); +      } +    }; +    return rewriteDebugUsers(From, To, DomPoint, DT, SignOrZeroExt); +  } + +  // TODO: Floating-point conversions, vectors. 
+  return false;  }  unsigned llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) { @@ -1452,13 +1902,19 @@ unsigned llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) {  }  unsigned llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap, -                                   bool PreserveLCSSA) { +                                   bool PreserveLCSSA, DeferredDominance *DDT) {    BasicBlock *BB = I->getParent(); +  std::vector <DominatorTree::UpdateType> Updates; +    // Loop over all of the successors, removing BB's entry from any PHI    // nodes. -  for (BasicBlock *Successor : successors(BB)) +  if (DDT) +    Updates.reserve(BB->getTerminator()->getNumSuccessors()); +  for (BasicBlock *Successor : successors(BB)) {      Successor->removePredecessor(BB, PreserveLCSSA); - +    if (DDT) +      Updates.push_back({DominatorTree::Delete, BB, Successor}); +  }    // Insert a call to llvm.trap right before this.  This turns the undefined    // behavior into a hard fail instead of falling through into random code.    if (UseLLVMTrap) { @@ -1478,11 +1934,13 @@ unsigned llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap,      BB->getInstList().erase(BBI++);      ++NumInstrsRemoved;    } +  if (DDT) +    DDT->applyUpdates(Updates);    return NumInstrsRemoved;  }  /// changeToCall - Convert the specified invoke into a normal call. -static void changeToCall(InvokeInst *II) { +static void changeToCall(InvokeInst *II, DeferredDominance *DDT = nullptr) {    SmallVector<Value*, 8> Args(II->arg_begin(), II->arg_end());    SmallVector<OperandBundleDef, 1> OpBundles;    II->getOperandBundlesAsDefs(OpBundles); @@ -1495,11 +1953,16 @@ static void changeToCall(InvokeInst *II) {    II->replaceAllUsesWith(NewCall);    // Follow the call by a branch to the normal destination. 
-  BranchInst::Create(II->getNormalDest(), II); +  BasicBlock *NormalDestBB = II->getNormalDest(); +  BranchInst::Create(NormalDestBB, II);    // Update PHI nodes in the unwind destination -  II->getUnwindDest()->removePredecessor(II->getParent()); +  BasicBlock *BB = II->getParent(); +  BasicBlock *UnwindDestBB = II->getUnwindDest(); +  UnwindDestBB->removePredecessor(BB);    II->eraseFromParent(); +  if (DDT) +    DDT->deleteEdge(BB, UnwindDestBB);  }  BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI, @@ -1540,7 +2003,8 @@ BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI,  }  static bool markAliveBlocks(Function &F, -                            SmallPtrSetImpl<BasicBlock*> &Reachable) { +                            SmallPtrSetImpl<BasicBlock*> &Reachable, +                            DeferredDominance *DDT = nullptr) {    SmallVector<BasicBlock*, 128> Worklist;    BasicBlock *BB = &F.front();    Worklist.push_back(BB); @@ -1553,41 +2017,44 @@ static bool markAliveBlocks(Function &F,      // instructions into LLVM unreachable insts.  The instruction combining pass      // canonicalizes unreachable insts into stores to null or undef.      for (Instruction &I : *BB) { -      // Assumptions that are known to be false are equivalent to unreachable. -      // Also, if the condition is undefined, then we make the choice most -      // beneficial to the optimizer, and choose that to also be unreachable. -      if (auto *II = dyn_cast<IntrinsicInst>(&I)) { -        if (II->getIntrinsicID() == Intrinsic::assume) { -          if (match(II->getArgOperand(0), m_CombineOr(m_Zero(), m_Undef()))) { -            // Don't insert a call to llvm.trap right before the unreachable. 
-            changeToUnreachable(II, false); -            Changed = true; -            break; -          } -        } - -        if (II->getIntrinsicID() == Intrinsic::experimental_guard) { -          // A call to the guard intrinsic bails out of the current compilation -          // unit if the predicate passed to it is false.  If the predicate is a -          // constant false, then we know the guard will bail out of the current -          // compile unconditionally, so all code following it is dead. -          // -          // Note: unlike in llvm.assume, it is not "obviously profitable" for -          // guards to treat `undef` as `false` since a guard on `undef` can -          // still be useful for widening. -          if (match(II->getArgOperand(0), m_Zero())) -            if (!isa<UnreachableInst>(II->getNextNode())) { -              changeToUnreachable(II->getNextNode(), /*UseLLVMTrap=*/ false); +      if (auto *CI = dyn_cast<CallInst>(&I)) { +        Value *Callee = CI->getCalledValue(); +        // Handle intrinsic calls. +        if (Function *F = dyn_cast<Function>(Callee)) { +          auto IntrinsicID = F->getIntrinsicID(); +          // Assumptions that are known to be false are equivalent to +          // unreachable. Also, if the condition is undefined, then we make the +          // choice most beneficial to the optimizer, and choose that to also be +          // unreachable. +          if (IntrinsicID == Intrinsic::assume) { +            if (match(CI->getArgOperand(0), m_CombineOr(m_Zero(), m_Undef()))) { +              // Don't insert a call to llvm.trap right before the unreachable. 
+              changeToUnreachable(CI, false, false, DDT);                Changed = true;                break;              } -        } -      } - -      if (auto *CI = dyn_cast<CallInst>(&I)) { -        Value *Callee = CI->getCalledValue(); -        if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { -          changeToUnreachable(CI, /*UseLLVMTrap=*/false); +          } else if (IntrinsicID == Intrinsic::experimental_guard) { +            // A call to the guard intrinsic bails out of the current +            // compilation unit if the predicate passed to it is false. If the +            // predicate is a constant false, then we know the guard will bail +            // out of the current compile unconditionally, so all code following +            // it is dead. +            // +            // Note: unlike in llvm.assume, it is not "obviously profitable" for +            // guards to treat `undef` as `false` since a guard on `undef` can +            // still be useful for widening. +            if (match(CI->getArgOperand(0), m_Zero())) +              if (!isa<UnreachableInst>(CI->getNextNode())) { +                changeToUnreachable(CI->getNextNode(), /*UseLLVMTrap=*/false, +                                    false, DDT); +                Changed = true; +                break; +              } +          } +        } else if ((isa<ConstantPointerNull>(Callee) && +                    !NullPointerIsDefined(CI->getFunction())) || +                   isa<UndefValue>(Callee)) { +          changeToUnreachable(CI, /*UseLLVMTrap=*/false, false, DDT);            Changed = true;            break;          } @@ -1597,17 +2064,16 @@ static bool markAliveBlocks(Function &F,            // though.            if (!isa<UnreachableInst>(CI->getNextNode())) {              // Don't insert a call to llvm.trap right before the unreachable. 
-            changeToUnreachable(CI->getNextNode(), false); +            changeToUnreachable(CI->getNextNode(), false, false, DDT);              Changed = true;            }            break;          } -      } +      } else if (auto *SI = dyn_cast<StoreInst>(&I)) { +        // Store to undef and store to null are undefined and used to signal +        // that they should be changed to unreachable by passes that can't +        // modify the CFG. -      // Store to undef and store to null are undefined and used to signal that -      // they should be changed to unreachable by passes that can't modify the -      // CFG. -      if (auto *SI = dyn_cast<StoreInst>(&I)) {          // Don't touch volatile stores.          if (SI->isVolatile()) continue; @@ -1615,8 +2081,9 @@ static bool markAliveBlocks(Function &F,          if (isa<UndefValue>(Ptr) ||              (isa<ConstantPointerNull>(Ptr) && -             SI->getPointerAddressSpace() == 0)) { -          changeToUnreachable(SI, true); +             !NullPointerIsDefined(SI->getFunction(), +                                   SI->getPointerAddressSpace()))) { +          changeToUnreachable(SI, true, false, DDT);            Changed = true;            break;          } @@ -1627,17 +2094,23 @@ static bool markAliveBlocks(Function &F,      if (auto *II = dyn_cast<InvokeInst>(Terminator)) {        // Turn invokes that call 'nounwind' functions into ordinary calls.        Value *Callee = II->getCalledValue(); -      if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { -        changeToUnreachable(II, true); +      if ((isa<ConstantPointerNull>(Callee) && +           !NullPointerIsDefined(BB->getParent())) || +          isa<UndefValue>(Callee)) { +        changeToUnreachable(II, true, false, DDT);          Changed = true;        } else if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(&F)) {          if (II->use_empty() && II->onlyReadsMemory()) {            // jump to the normal destination branch. 
-          BranchInst::Create(II->getNormalDest(), II); -          II->getUnwindDest()->removePredecessor(II->getParent()); +          BasicBlock *NormalDestBB = II->getNormalDest(); +          BasicBlock *UnwindDestBB = II->getUnwindDest(); +          BranchInst::Create(NormalDestBB, II); +          UnwindDestBB->removePredecessor(II->getParent());            II->eraseFromParent(); +          if (DDT) +            DDT->deleteEdge(BB, UnwindDestBB);          } else -          changeToCall(II); +          changeToCall(II, DDT);          Changed = true;        }      } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Terminator)) { @@ -1683,7 +2156,7 @@ static bool markAliveBlocks(Function &F,        }      } -    Changed |= ConstantFoldTerminator(BB, true); +    Changed |= ConstantFoldTerminator(BB, true, nullptr, DDT);      for (BasicBlock *Successor : successors(BB))        if (Reachable.insert(Successor).second)          Worklist.push_back(Successor); @@ -1691,11 +2164,11 @@ static bool markAliveBlocks(Function &F,    return Changed;  } -void llvm::removeUnwindEdge(BasicBlock *BB) { +void llvm::removeUnwindEdge(BasicBlock *BB, DeferredDominance *DDT) {    TerminatorInst *TI = BB->getTerminator();    if (auto *II = dyn_cast<InvokeInst>(TI)) { -    changeToCall(II); +    changeToCall(II, DDT);      return;    } @@ -1723,15 +2196,18 @@ void llvm::removeUnwindEdge(BasicBlock *BB) {    UnwindDest->removePredecessor(BB);    TI->replaceAllUsesWith(NewTI);    TI->eraseFromParent(); +  if (DDT) +    DDT->deleteEdge(BB, UnwindDest);  }  /// removeUnreachableBlocks - Remove blocks that are not reachable, even  /// if they are in a dead cycle.  Return true if a change was made, false  /// otherwise. If `LVI` is passed, this function preserves LazyValueInfo  /// after modifying the CFG. 
-bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI) { +bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI, +                                   DeferredDominance *DDT) {    SmallPtrSet<BasicBlock*, 16> Reachable; -  bool Changed = markAliveBlocks(F, Reachable); +  bool Changed = markAliveBlocks(F, Reachable, DDT);    // If there are unreachable blocks in the CFG...    if (Reachable.size() == F.size()) @@ -1741,25 +2217,39 @@ bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI) {    NumRemoved += F.size()-Reachable.size();    // Loop over all of the basic blocks that are not reachable, dropping all of -  // their internal references... -  for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) { -    if (Reachable.count(&*BB)) +  // their internal references. Update DDT and LVI if available. +  std::vector <DominatorTree::UpdateType> Updates; +  for (Function::iterator I = ++F.begin(), E = F.end(); I != E; ++I) { +    auto *BB = &*I; +    if (Reachable.count(BB))        continue; - -    for (BasicBlock *Successor : successors(&*BB)) +    for (BasicBlock *Successor : successors(BB)) {        if (Reachable.count(Successor)) -        Successor->removePredecessor(&*BB); +        Successor->removePredecessor(BB); +      if (DDT) +        Updates.push_back({DominatorTree::Delete, BB, Successor}); +    }      if (LVI) -      LVI->eraseBlock(&*BB); +      LVI->eraseBlock(BB);      BB->dropAllReferences();    } -  for (Function::iterator I = ++F.begin(); I != F.end();) -    if (!Reachable.count(&*I)) -      I = F.getBasicBlockList().erase(I); -    else +  for (Function::iterator I = ++F.begin(); I != F.end();) { +    auto *BB = &*I; +    if (Reachable.count(BB)) {        ++I; +      continue; +    } +    if (DDT) { +      DDT->deleteBB(BB); // deferred deletion of BB. 
+      ++I; +    } else { +      I = F.getBasicBlockList().erase(I); +    } +  } +  if (DDT) +    DDT->applyUpdates(Updates);    return true;  } @@ -1852,8 +2342,8 @@ static unsigned replaceDominatedUsesWith(Value *From, Value *To,      if (!Dominates(Root, U))        continue;      U.set(To); -    DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as " -                 << *To << " in " << *U << "\n"); +    LLVM_DEBUG(dbgs() << "Replace dominated use of '" << From->getName() +                      << "' as " << *To << " in " << *U << "\n");      ++Count;    }    return Count; @@ -1957,7 +2447,7 @@ void llvm::copyRangeMetadata(const DataLayout &DL, const LoadInst &OldLI,    if (!NewTy->isPointerTy())      return; -  unsigned BitWidth = DL.getTypeSizeInBits(NewTy); +  unsigned BitWidth = DL.getIndexTypeSizeInBits(NewTy);    if (!getConstantRangeFromMetadata(*N).contains(APInt(BitWidth, 0))) {      MDNode *NN = MDNode::get(OldLI.getContext(), None);      NewLI.setMetadata(LLVMContext::MD_nonnull, NN); @@ -2269,7 +2759,7 @@ bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) {      // Static allocas (constant size in the entry block) are handled by      // prologue/epilogue insertion so they're free anyway. We definitely don't      // want to make them non-constant. -    return !dyn_cast<AllocaInst>(I)->isStaticAlloca(); +    return !cast<AllocaInst>(I)->isStaticAlloca();    case Instruction::GetElementPtr:      if (OpIdx == 0)        return true; diff --git a/lib/Transforms/Utils/LoopRotationUtils.cpp b/lib/Transforms/Utils/LoopRotationUtils.cpp new file mode 100644 index 000000000000..6e92e679f999 --- /dev/null +++ b/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -0,0 +1,645 @@ +//===----------------- LoopRotationUtils.cpp -----------------------------===// +// +//                     The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides utilities to convert a loop into a loop with bottom test. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/LoopRotationUtils.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +using namespace llvm; + +#define DEBUG_TYPE "loop-rotate" + +STATISTIC(NumRotated, "Number of loops rotated"); + +namespace { +/// A simple loop rotation transformation. 
+class LoopRotate { +  const unsigned MaxHeaderSize; +  LoopInfo *LI; +  const TargetTransformInfo *TTI; +  AssumptionCache *AC; +  DominatorTree *DT; +  ScalarEvolution *SE; +  const SimplifyQuery &SQ; +  bool RotationOnly; +  bool IsUtilMode; + +public: +  LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI, +             const TargetTransformInfo *TTI, AssumptionCache *AC, +             DominatorTree *DT, ScalarEvolution *SE, const SimplifyQuery &SQ, +             bool RotationOnly, bool IsUtilMode) +      : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE), +        SQ(SQ), RotationOnly(RotationOnly), IsUtilMode(IsUtilMode) {} +  bool processLoop(Loop *L); + +private: +  bool rotateLoop(Loop *L, bool SimplifiedLatch); +  bool simplifyLoopLatch(Loop *L); +}; +} // end anonymous namespace + +/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the +/// old header into the preheader.  If there were uses of the values produced by +/// these instruction that were outside of the loop, we have to insert PHI nodes +/// to merge the two values.  Do this now. +static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, +                                            BasicBlock *OrigPreheader, +                                            ValueToValueMapTy &ValueMap, +                                SmallVectorImpl<PHINode*> *InsertedPHIs) { +  // Remove PHI node entries that are no longer live. +  BasicBlock::iterator I, E = OrigHeader->end(); +  for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I) +    PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader)); + +  // Now fix up users of the instructions in OrigHeader, inserting PHI nodes +  // as necessary. +  SSAUpdater SSA(InsertedPHIs); +  for (I = OrigHeader->begin(); I != E; ++I) { +    Value *OrigHeaderVal = &*I; + +    // If there are no uses of the value (e.g. because it returns void), there +    // is nothing to rewrite. 
+    if (OrigHeaderVal->use_empty()) +      continue; + +    Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal); + +    // The value now exits in two versions: the initial value in the preheader +    // and the loop "next" value in the original header. +    SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName()); +    SSA.AddAvailableValue(OrigHeader, OrigHeaderVal); +    SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal); + +    // Visit each use of the OrigHeader instruction. +    for (Value::use_iterator UI = OrigHeaderVal->use_begin(), +                             UE = OrigHeaderVal->use_end(); +         UI != UE;) { +      // Grab the use before incrementing the iterator. +      Use &U = *UI; + +      // Increment the iterator before removing the use from the list. +      ++UI; + +      // SSAUpdater can't handle a non-PHI use in the same block as an +      // earlier def. We can easily handle those cases manually. +      Instruction *UserInst = cast<Instruction>(U.getUser()); +      if (!isa<PHINode>(UserInst)) { +        BasicBlock *UserBB = UserInst->getParent(); + +        // The original users in the OrigHeader are already using the +        // original definitions. +        if (UserBB == OrigHeader) +          continue; + +        // Users in the OrigPreHeader need to use the value to which the +        // original definitions are mapped. +        if (UserBB == OrigPreheader) { +          U = OrigPreHeaderVal; +          continue; +        } +      } + +      // Anything else can be handled by SSAUpdater. +      SSA.RewriteUse(U); +    } + +    // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug +    // intrinsics. +    SmallVector<DbgValueInst *, 1> DbgValues; +    llvm::findDbgValues(DbgValues, OrigHeaderVal); +    for (auto &DbgValue : DbgValues) { +      // The original users in the OrigHeader are already using the original +      // definitions. 
+      BasicBlock *UserBB = DbgValue->getParent(); +      if (UserBB == OrigHeader) +        continue; + +      // Users in the OrigPreHeader need to use the value to which the +      // original definitions are mapped and anything else can be handled by +      // the SSAUpdater. To avoid adding PHINodes, check if the value is +      // available in UserBB, if not substitute undef. +      Value *NewVal; +      if (UserBB == OrigPreheader) +        NewVal = OrigPreHeaderVal; +      else if (SSA.HasValueForBlock(UserBB)) +        NewVal = SSA.GetValueInMiddleOfBlock(UserBB); +      else +        NewVal = UndefValue::get(OrigHeaderVal->getType()); +      DbgValue->setOperand(0, +                           MetadataAsValue::get(OrigHeaderVal->getContext(), +                                                ValueAsMetadata::get(NewVal))); +    } +  } +} + +// Look for a phi which is only used outside the loop (via a LCSSA phi) +// in the exit from the header. This means that rotating the loop can +// remove the phi. +static bool shouldRotateLoopExitingLatch(Loop *L) { +  BasicBlock *Header = L->getHeader(); +  BasicBlock *HeaderExit = Header->getTerminator()->getSuccessor(0); +  if (L->contains(HeaderExit)) +    HeaderExit = Header->getTerminator()->getSuccessor(1); + +  for (auto &Phi : Header->phis()) { +    // Look for uses of this phi in the loop/via exits other than the header. +    if (llvm::any_of(Phi.users(), [HeaderExit](const User *U) { +          return cast<Instruction>(U)->getParent() != HeaderExit; +        })) +      continue; +    return true; +  } + +  return false; +} + +/// Rotate loop LP. Return true if the loop is rotated. +/// +/// \param SimplifiedLatch is true if the latch was just folded into the final +/// loop exit. In this case we may want to rotate even though the new latch is +/// now an exiting branch. This rotation would have happened had the latch not +/// been simplified. 
However, if SimplifiedLatch is false, then we avoid +/// rotating loops in which the latch exits to avoid excessive or endless +/// rotation. LoopRotate should be repeatable and converge to a canonical +/// form. This property is satisfied because simplifying the loop latch can only +/// happen once across multiple invocations of the LoopRotate pass. +bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { +  // If the loop has only one block then there is not much to rotate. +  if (L->getBlocks().size() == 1) +    return false; + +  BasicBlock *OrigHeader = L->getHeader(); +  BasicBlock *OrigLatch = L->getLoopLatch(); + +  BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); +  if (!BI || BI->isUnconditional()) +    return false; + +  // If the loop header is not one of the loop exiting blocks then +  // either this loop is already rotated or it is not +  // suitable for loop rotation transformations. +  if (!L->isLoopExiting(OrigHeader)) +    return false; + +  // If the loop latch already contains a branch that leaves the loop then the +  // loop is already rotated. +  if (!OrigLatch) +    return false; + +  // Rotate if either the loop latch does *not* exit the loop, or if the loop +  // latch was just simplified. Or if we think it will be profitable. +  if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false && +      !shouldRotateLoopExitingLatch(L)) +    return false; + +  // Check size of original header and reject loop if it is very big or we can't +  // duplicate blocks inside it. 
+  { +    SmallPtrSet<const Value *, 32> EphValues; +    CodeMetrics::collectEphemeralValues(L, AC, EphValues); + +    CodeMetrics Metrics; +    Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues); +    if (Metrics.notDuplicatable) { +      LLVM_DEBUG( +          dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" +                 << " instructions: "; +          L->dump()); +      return false; +    } +    if (Metrics.convergent) { +      LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent " +                           "instructions: "; +                 L->dump()); +      return false; +    } +    if (Metrics.NumInsts > MaxHeaderSize) +      return false; +  } + +  // Now, this loop is suitable for rotation. +  BasicBlock *OrigPreheader = L->getLoopPreheader(); + +  // If the loop could not be converted to canonical form, it must have an +  // indirectbr in it, just give up. +  if (!OrigPreheader || !L->hasDedicatedExits()) +    return false; + +  // Anything ScalarEvolution may know about this loop or the PHI nodes +  // in its header will soon be invalidated. We should also invalidate +  // all outer loops because insertion and deletion of blocks that happens +  // during the rotation may violate invariants related to backedge taken +  // infos in them. +  if (SE) +    SE->forgetTopmostLoop(L); + +  LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); + +  // Find new Loop header. NewHeader is a Header's one and only successor +  // that is inside loop.  Header's other successor is outside the +  // loop.  Otherwise loop is not suitable for rotation. 
+  BasicBlock *Exit = BI->getSuccessor(0); +  BasicBlock *NewHeader = BI->getSuccessor(1); +  if (L->contains(Exit)) +    std::swap(Exit, NewHeader); +  assert(NewHeader && "Unable to determine new loop header"); +  assert(L->contains(NewHeader) && !L->contains(Exit) && +         "Unable to determine loop header and exit blocks"); + +  // This code assumes that the new header has exactly one predecessor. +  // Remove any single-entry PHI nodes in it. +  assert(NewHeader->getSinglePredecessor() && +         "New header doesn't have one pred!"); +  FoldSingleEntryPHINodes(NewHeader); + +  // Begin by walking OrigHeader and populating ValueMap with an entry for +  // each Instruction. +  BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end(); +  ValueToValueMapTy ValueMap; + +  // For PHI nodes, the value available in OldPreHeader is just the +  // incoming value from OldPreHeader. +  for (; PHINode *PN = dyn_cast<PHINode>(I); ++I) +    ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader); + +  // For the rest of the instructions, either hoist to the OrigPreheader if +  // possible or create a clone in the OldPreHeader if not. +  TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator(); + +  // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication. 
+  using DbgIntrinsicHash = +      std::pair<std::pair<Value *, DILocalVariable *>, DIExpression *>; +  auto makeHash = [](DbgInfoIntrinsic *D) -> DbgIntrinsicHash { +    return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()}; +  }; +  SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics; +  for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend(); +       I != E; ++I) { +    if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&*I)) +      DbgIntrinsics.insert(makeHash(DII)); +    else +      break; +  } + +  while (I != E) { +    Instruction *Inst = &*I++; + +    // If the instruction's operands are invariant and it doesn't read or write +    // memory, then it is safe to hoist.  Doing this doesn't change the order of +    // execution in the preheader, but does prevent the instruction from +    // executing in each iteration of the loop.  This means it is safe to hoist +    // something that might trap, but isn't safe to hoist something that reads +    // memory (without proving that the loop doesn't write). +    if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() && +        !Inst->mayWriteToMemory() && !isa<TerminatorInst>(Inst) && +        !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) { +      Inst->moveBefore(LoopEntryBranch); +      continue; +    } + +    // Otherwise, create a duplicate of the instruction. +    Instruction *C = Inst->clone(); + +    // Eagerly remap the operands of the instruction. +    RemapInstruction(C, ValueMap, +                     RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + +    // Avoid inserting the same intrinsic twice. +    if (auto *DII = dyn_cast<DbgInfoIntrinsic>(C)) +      if (DbgIntrinsics.count(makeHash(DII))) { +        C->deleteValue(); +        continue; +      } + +    // With the operands remapped, see if the instruction constant folds or is +    // otherwise simplifyable.  
This commonly occurs because the entry from PHI +    // nodes allows icmps and other instructions to fold. +    Value *V = SimplifyInstruction(C, SQ); +    if (V && LI->replacementPreservesLCSSAForm(C, V)) { +      // If so, then delete the temporary instruction and stick the folded value +      // in the map. +      ValueMap[Inst] = V; +      if (!C->mayHaveSideEffects()) { +        C->deleteValue(); +        C = nullptr; +      } +    } else { +      ValueMap[Inst] = C; +    } +    if (C) { +      // Otherwise, stick the new instruction into the new block! +      C->setName(Inst->getName()); +      C->insertBefore(LoopEntryBranch); + +      if (auto *II = dyn_cast<IntrinsicInst>(C)) +        if (II->getIntrinsicID() == Intrinsic::assume) +          AC->registerAssumption(II); +    } +  } + +  // Along with all the other instructions, we just cloned OrigHeader's +  // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's +  // successors by duplicating their incoming values for OrigHeader. +  TerminatorInst *TI = OrigHeader->getTerminator(); +  for (BasicBlock *SuccBB : TI->successors()) +    for (BasicBlock::iterator BI = SuccBB->begin(); +         PHINode *PN = dyn_cast<PHINode>(BI); ++BI) +      PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); + +  // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove +  // OrigPreHeader's old terminator (the original branch into the loop), and +  // remove the corresponding incoming values from the PHI nodes in OrigHeader. 
+  LoopEntryBranch->eraseFromParent(); + + +  SmallVector<PHINode*, 2> InsertedPHIs; +  // If there were any uses of instructions in the duplicated block outside the +  // loop, update them, inserting PHI nodes as required +  RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, +                                  &InsertedPHIs); + +  // Attach dbg.value intrinsics to the new phis if that phi uses a value that +  // previously had debug metadata attached. This keeps the debug info +  // up-to-date in the loop body. +  if (!InsertedPHIs.empty()) +    insertDebugValuesForPHIs(OrigHeader, InsertedPHIs); + +  // NewHeader is now the header of the loop. +  L->moveToHeader(NewHeader); +  assert(L->getHeader() == NewHeader && "Latch block is our new header"); + +  // Inform DT about changes to the CFG. +  if (DT) { +    // The OrigPreheader branches to the NewHeader and Exit now. Then, inform +    // the DT about the removed edge to the OrigHeader (that got removed). +    SmallVector<DominatorTree::UpdateType, 3> Updates; +    Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit}); +    Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader}); +    Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader}); +    DT->applyUpdates(Updates); +  } + +  // At this point, we've finished our major CFG changes.  As part of cloning +  // the loop into the preheader we've simplified instructions and the +  // duplicated conditional branch may now be branching on a constant.  If it is +  // branching on a constant and if that constant means that we enter the loop, +  // then we fold away the cond branch to an uncond branch.  This simplifies the +  // loop in cases important for nested loops, and it also means we don't have +  // to split as many edges. 
+  BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator()); +  assert(PHBI->isConditional() && "Should be clone of BI condbr!"); +  if (!isa<ConstantInt>(PHBI->getCondition()) || +      PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) != +          NewHeader) { +    // The conditional branch can't be folded, handle the general case. +    // Split edges as necessary to preserve LoopSimplify form. + +    // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and +    // thus is not a preheader anymore. +    // Split the edge to form a real preheader. +    BasicBlock *NewPH = SplitCriticalEdge( +        OrigPreheader, NewHeader, +        CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA()); +    NewPH->setName(NewHeader->getName() + ".lr.ph"); + +    // Preserve canonical loop form, which means that 'Exit' should have only +    // one predecessor. Note that Exit could be an exit block for multiple +    // nested loops, causing both of the edges to now be critical and need to +    // be split. +    SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit)); +    bool SplitLatchEdge = false; +    for (BasicBlock *ExitPred : ExitPreds) { +      // We only need to split loop exit edges. +      Loop *PredLoop = LI->getLoopFor(ExitPred); +      if (!PredLoop || PredLoop->contains(Exit)) +        continue; +      if (isa<IndirectBrInst>(ExitPred->getTerminator())) +        continue; +      SplitLatchEdge |= L->getLoopLatch() == ExitPred; +      BasicBlock *ExitSplit = SplitCriticalEdge( +          ExitPred, Exit, +          CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA()); +      ExitSplit->moveBefore(Exit); +    } +    assert(SplitLatchEdge && +           "Despite splitting all preds, failed to split latch exit?"); +  } else { +    // We can fold the conditional branch in the preheader, this makes things +    // simpler. The first step is to remove the extra edge to the Exit block. 
+    Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/); +    BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI); +    NewBI->setDebugLoc(PHBI->getDebugLoc()); +    PHBI->eraseFromParent(); + +    // With our CFG finalized, update DomTree if it is available. +    if (DT) DT->deleteEdge(OrigPreheader, Exit); +  } + +  assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); +  assert(L->getLoopLatch() && "Invalid loop latch after loop rotation"); + +  // Now that the CFG and DomTree are in a consistent state again, try to merge +  // the OrigHeader block into OrigLatch.  This will succeed if they are +  // connected by an unconditional branch.  This is just a cleanup so the +  // emitted code isn't too gross in this common case. +  MergeBlockIntoPredecessor(OrigHeader, DT, LI); + +  LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump()); + +  ++NumRotated; +  return true; +} + +/// Determine whether the instructions in this range may be safely and cheaply +/// speculated. This is not an important enough situation to develop complex +/// heuristics. We handle a single arithmetic instruction along with any type +/// conversions. +static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, +                                  BasicBlock::iterator End, Loop *L) { +  bool seenIncrement = false; +  bool MultiExitLoop = false; + +  if (!L->getExitingBlock()) +    MultiExitLoop = true; + +  for (BasicBlock::iterator I = Begin; I != End; ++I) { + +    if (!isSafeToSpeculativelyExecute(&*I)) +      return false; + +    if (isa<DbgInfoIntrinsic>(I)) +      continue; + +    switch (I->getOpcode()) { +    default: +      return false; +    case Instruction::GetElementPtr: +      // GEPs are cheap if all indices are constant. 
+      if (!cast<GEPOperator>(I)->hasAllConstantIndices()) +        return false; +      // fall-thru to increment case +      LLVM_FALLTHROUGH; +    case Instruction::Add: +    case Instruction::Sub: +    case Instruction::And: +    case Instruction::Or: +    case Instruction::Xor: +    case Instruction::Shl: +    case Instruction::LShr: +    case Instruction::AShr: { +      Value *IVOpnd = +          !isa<Constant>(I->getOperand(0)) +              ? I->getOperand(0) +              : !isa<Constant>(I->getOperand(1)) ? I->getOperand(1) : nullptr; +      if (!IVOpnd) +        return false; + +      // If increment operand is used outside of the loop, this speculation +      // could cause extra live range interference. +      if (MultiExitLoop) { +        for (User *UseI : IVOpnd->users()) { +          auto *UserInst = cast<Instruction>(UseI); +          if (!L->contains(UserInst)) +            return false; +        } +      } + +      if (seenIncrement) +        return false; +      seenIncrement = true; +      break; +    } +    case Instruction::Trunc: +    case Instruction::ZExt: +    case Instruction::SExt: +      // ignore type conversions +      break; +    } +  } +  return true; +} + +/// Fold the loop tail into the loop exit by speculating the loop tail +/// instructions. Typically, this is a single post-increment. In the case of a +/// simple 2-block loop, hoisting the increment can be much better than +/// duplicating the entire loop header. In the case of loops with early exits, +/// rotation will not work anyway, but simplifyLoopLatch will put the loop in +/// canonical form so downstream passes can handle it. +/// +/// I don't believe this invalidates SCEV. 
+bool LoopRotate::simplifyLoopLatch(Loop *L) { +  BasicBlock *Latch = L->getLoopLatch(); +  if (!Latch || Latch->hasAddressTaken()) +    return false; + +  BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator()); +  if (!Jmp || !Jmp->isUnconditional()) +    return false; + +  BasicBlock *LastExit = Latch->getSinglePredecessor(); +  if (!LastExit || !L->isLoopExiting(LastExit)) +    return false; + +  BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator()); +  if (!BI) +    return false; + +  if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L)) +    return false; + +  LLVM_DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " +                    << LastExit->getName() << "\n"); + +  // Hoist the instructions from Latch into LastExit. +  LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(), +                                 Latch->begin(), Jmp->getIterator()); + +  unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1; +  BasicBlock *Header = Jmp->getSuccessor(0); +  assert(Header == L->getHeader() && "expected a backward branch"); + +  // Remove Latch from the CFG so that LastExit becomes the new Latch. +  BI->setSuccessor(FallThruPath, Header); +  Latch->replaceSuccessorsPhiUsesWith(LastExit); +  Jmp->eraseFromParent(); + +  // Nuke the Latch block. +  assert(Latch->empty() && "unable to evacuate Latch"); +  LI->removeBlock(Latch); +  if (DT) +    DT->eraseNode(Latch); +  Latch->eraseFromParent(); +  return true; +} + +/// Rotate \c L, and return true if any modification was made. +bool LoopRotate::processLoop(Loop *L) { +  // Save the loop metadata. +  MDNode *LoopMD = L->getLoopID(); + +  bool SimplifiedLatch = false; + +  // Simplify the loop latch before attempting to rotate the header +  // upward. Rotation may not be needed if the loop tail can be folded into the +  // loop exit. 
+  if (!RotationOnly) +    SimplifiedLatch = simplifyLoopLatch(L); + +  bool MadeChange = rotateLoop(L, SimplifiedLatch); +  assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) && +         "Loop latch should be exiting after loop-rotate."); + +  // Restore the loop metadata. +  // NB! We presume LoopRotation DOESN'T ADD its own metadata. +  if ((MadeChange || SimplifiedLatch) && LoopMD) +    L->setLoopID(LoopMD); + +  return MadeChange || SimplifiedLatch; +} + + +/// The utility to convert a loop into a loop with bottom test. +bool llvm::LoopRotation(Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI, +                        AssumptionCache *AC, DominatorTree *DT, +                        ScalarEvolution *SE, const SimplifyQuery &SQ, +                        bool RotationOnly = true, +                        unsigned Threshold = unsigned(-1), +                        bool IsUtilMode = true) { +  LoopRotate LR(Threshold, LI, TTI, AC, DT, SE, SQ, RotationOnly, IsUtilMode); + +  return LR.processLoop(L); +} diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp index f43af9772771..970494eb4704 100644 --- a/lib/Transforms/Utils/LoopSimplify.cpp +++ b/lib/Transforms/Utils/LoopSimplify.cpp @@ -52,6 +52,7 @@  #include "llvm/Analysis/LoopInfo.h"  #include "llvm/Analysis/ScalarEvolution.h"  #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/Transforms/Utils/Local.h"  #include "llvm/IR/CFG.h"  #include "llvm/IR/Constants.h"  #include "llvm/IR/DataLayout.h" @@ -64,9 +65,8 @@  #include "llvm/IR/Type.h"  #include "llvm/Support/Debug.h"  #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h"  #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h"  #include "llvm/Transforms/Utils/LoopUtils.h"  using namespace llvm; @@ -141,8 +141,8 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, DominatorTree *DT,    if 
(!PreheaderBB)      return nullptr; -  DEBUG(dbgs() << "LoopSimplify: Creating pre-header " -               << PreheaderBB->getName() << "\n"); +  LLVM_DEBUG(dbgs() << "LoopSimplify: Creating pre-header " +                    << PreheaderBB->getName() << "\n");    // Make sure that NewBB is put someplace intelligent, which doesn't mess up    // code layout too horribly. @@ -170,7 +170,7 @@ static void addBlockAndPredsToSet(BasicBlock *InputBB, BasicBlock *StopBlock,    } while (!Worklist.empty());  } -/// \brief The first part of loop-nestification is to find a PHI node that tells +/// The first part of loop-nestification is to find a PHI node that tells  /// us how to partition the loops.  static PHINode *findPHIToPartitionLoops(Loop *L, DominatorTree *DT,                                          AssumptionCache *AC) { @@ -195,7 +195,7 @@ static PHINode *findPHIToPartitionLoops(Loop *L, DominatorTree *DT,    return nullptr;  } -/// \brief If this loop has multiple backedges, try to pull one of them out into +/// If this loop has multiple backedges, try to pull one of them out into  /// a nested loop.  ///  /// This is important for code that looks like @@ -242,7 +242,7 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader,        OuterLoopPreds.push_back(PN->getIncomingBlock(i));      }    } -  DEBUG(dbgs() << "LoopSimplify: Splitting out a new outer loop\n"); +  LLVM_DEBUG(dbgs() << "LoopSimplify: Splitting out a new outer loop\n");    // If ScalarEvolution is around and knows anything about values in    // this loop, tell it to forget them, because we're about to @@ -332,7 +332,7 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader,    return NewOuter;  } -/// \brief This method is called when the specified loop has more than one +/// This method is called when the specified loop has more than one  /// backedge in it.  
///  /// If this occurs, revector all of these backedges to target a new basic block @@ -371,8 +371,8 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,    BranchInst *BETerminator = BranchInst::Create(Header, BEBlock);    BETerminator->setDebugLoc(Header->getFirstNonPHI()->getDebugLoc()); -  DEBUG(dbgs() << "LoopSimplify: Inserting unique backedge block " -               << BEBlock->getName() << "\n"); +  LLVM_DEBUG(dbgs() << "LoopSimplify: Inserting unique backedge block " +                    << BEBlock->getName() << "\n");    // Move the new backedge block to right after the last backedge block.    Function::iterator InsertPos = ++BackedgeBlocks.back()->getIterator(); @@ -457,7 +457,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,    return BEBlock;  } -/// \brief Simplify one loop and queue further loops for simplification. +/// Simplify one loop and queue further loops for simplification.  static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist,                              DominatorTree *DT, LoopInfo *LI,                              ScalarEvolution *SE, AssumptionCache *AC, @@ -484,8 +484,8 @@ ReprocessLoop:      // Delete each unique out-of-loop (and thus dead) predecessor.      for (BasicBlock *P : BadPreds) { -      DEBUG(dbgs() << "LoopSimplify: Deleting edge from dead predecessor " -                   << P->getName() << "\n"); +      LLVM_DEBUG(dbgs() << "LoopSimplify: Deleting edge from dead predecessor " +                        << P->getName() << "\n");        // Zap the dead pred's terminator and replace it with unreachable.        
TerminatorInst *TI = P->getTerminator(); @@ -504,16 +504,13 @@ ReprocessLoop:        if (BI->isConditional()) {          if (UndefValue *Cond = dyn_cast<UndefValue>(BI->getCondition())) { -          DEBUG(dbgs() << "LoopSimplify: Resolving \"br i1 undef\" to exit in " -                       << ExitingBlock->getName() << "\n"); +          LLVM_DEBUG(dbgs() +                     << "LoopSimplify: Resolving \"br i1 undef\" to exit in " +                     << ExitingBlock->getName() << "\n");            BI->setCondition(ConstantInt::get(Cond->getType(),                                              !L->contains(BI->getSuccessor(0)))); -          // This may make the loop analyzable, force SCEV recomputation. -          if (SE) -            SE->forgetLoop(L); -            Changed = true;          }        } @@ -617,11 +614,8 @@ ReprocessLoop:        // comparison and the branch.        bool AllInvariant = true;        bool AnyInvariant = false; -      for (BasicBlock::iterator I = ExitingBlock->begin(); &*I != BI; ) { +      for (auto I = ExitingBlock->instructionsWithoutDebug().begin(); &*I != BI; ) {          Instruction *Inst = &*I++; -        // Skip debug info intrinsics. -        if (isa<DbgInfoIntrinsic>(Inst)) -          continue;          if (Inst == CI)            continue;          if (!L->makeLoopInvariant(Inst, AnyInvariant, @@ -648,15 +642,8 @@ ReprocessLoop:        // Success. The block is now dead, so remove it from the loop,        // update the dominator tree and delete it. -      DEBUG(dbgs() << "LoopSimplify: Eliminating exiting block " -                   << ExitingBlock->getName() << "\n"); - -      // Notify ScalarEvolution before deleting this block. Currently assume the -      // parent loop doesn't change (spliting edges doesn't count). If blocks, -      // CFG edges, or other values in the parent loop change, then we need call -      // to forgetLoop() for the parent instead. 
-      if (SE) -        SE->forgetLoop(L); +      LLVM_DEBUG(dbgs() << "LoopSimplify: Eliminating exiting block " +                        << ExitingBlock->getName() << "\n");        assert(pred_begin(ExitingBlock) == pred_end(ExitingBlock));        Changed = true; @@ -679,6 +666,12 @@ ReprocessLoop:      }    } +  // Changing exit conditions for blocks may affect exit counts of this loop and +  // any of its parents, so we must invalidate the entire subtree if we've made +  // any changes. +  if (Changed && SE) +    SE->forgetTopmostLoop(L); +    return Changed;  } diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index dc98a39adcc5..04b8c1417e0a 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -23,6 +23,7 @@  #include "llvm/Analysis/LoopIterator.h"  #include "llvm/Analysis/OptimizationRemarkEmitter.h"  #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Transforms/Utils/Local.h"  #include "llvm/IR/BasicBlock.h"  #include "llvm/IR/DataLayout.h"  #include "llvm/IR/DebugInfoMetadata.h" @@ -33,7 +34,6 @@  #include "llvm/Support/raw_ostream.h"  #include "llvm/Transforms/Utils/BasicBlockUtils.h"  #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/Local.h"  #include "llvm/Transforms/Utils/LoopSimplify.h"  #include "llvm/Transforms/Utils/LoopUtils.h"  #include "llvm/Transforms/Utils/SimplifyIndVar.h" @@ -63,8 +63,7 @@ UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden,  /// Convert the instruction operands from referencing the current values into  /// those specified by VMap. 
-static inline void remapInstruction(Instruction *I, -                                    ValueToValueMapTy &VMap) { +void llvm::remapInstruction(Instruction *I, ValueToValueMapTy &VMap) {    for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) {      Value *Op = I->getOperand(op); @@ -97,16 +96,10 @@ static inline void remapInstruction(Instruction *I,  /// Folds a basic block into its predecessor if it only has one predecessor, and  /// that predecessor only has one successor. -/// The LoopInfo Analysis that is passed will be kept consistent.  If folding is -/// successful references to the containing loop must be removed from -/// ScalarEvolution by calling ScalarEvolution::forgetLoop because SE may have -/// references to the eliminated BB.  The argument ForgottenLoops contains a set -/// of loops that have already been forgotten to prevent redundant, expensive -/// calls to ScalarEvolution::forgetLoop.  Returns the new combined block. -static BasicBlock * -foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, ScalarEvolution *SE, -                         SmallPtrSetImpl<Loop *> &ForgottenLoops, -                         DominatorTree *DT) { +/// The LoopInfo Analysis that is passed will be kept consistent. +BasicBlock *llvm::foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, +                                           ScalarEvolution *SE, +                                           DominatorTree *DT) {    // Merge basic blocks into their predecessor if there is only one distinct    // pred, and if there is only one distinct successor of the predecessor, and    // if there are no PHI nodes. 
@@ -116,7 +109,8 @@ foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, ScalarEvolution *SE,    if (OnlyPred->getTerminator()->getNumSuccessors() != 1)      return nullptr; -  DEBUG(dbgs() << "Merging: " << *BB << "into: " << *OnlyPred); +  LLVM_DEBUG(dbgs() << "Merging: " << BB->getName() << " into " +                    << OnlyPred->getName() << "\n");    // Resolve any PHI nodes at the start of the block.  They are all    // guaranteed to have exactly one entry if they exist, unless there are @@ -149,13 +143,6 @@ foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, ScalarEvolution *SE,        DT->eraseNode(BB);      } -  // ScalarEvolution holds references to loop exit blocks. -  if (SE) { -    if (Loop *L = LI->getLoopFor(BB)) { -      if (ForgottenLoops.insert(L).second) -        SE->forgetLoop(L); -    } -  }    LI->removeBlock(BB);    // Inherit predecessor's name if it exists... @@ -258,16 +245,55 @@ static bool isEpilogProfitable(Loop *L) {    BasicBlock *PreHeader = L->getLoopPreheader();    BasicBlock *Header = L->getHeader();    assert(PreHeader && Header); -  for (Instruction &BBI : *Header) { -    PHINode *PN = dyn_cast<PHINode>(&BBI); -    if (!PN) -      break; -    if (isa<ConstantInt>(PN->getIncomingValueForBlock(PreHeader))) +  for (const PHINode &PN : Header->phis()) { +    if (isa<ConstantInt>(PN.getIncomingValueForBlock(PreHeader)))        return true;    }    return false;  } +/// Perform some cleanup and simplifications on loops after unrolling. It is +/// useful to simplify the IV's in the new loop, as well as do a quick +/// simplify/dce pass of the instructions. +void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, +                                   ScalarEvolution *SE, DominatorTree *DT, +                                   AssumptionCache *AC) { +  // Simplify any new induction variables in the partially unrolled loop. 
+  if (SE && SimplifyIVs) { +    SmallVector<WeakTrackingVH, 16> DeadInsts; +    simplifyLoopIVs(L, SE, DT, LI, DeadInsts); + +    // Aggressively clean up dead instructions that simplifyLoopIVs already +    // identified. Any remaining should be cleaned up below. +    while (!DeadInsts.empty()) +      if (Instruction *Inst = +              dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) +        RecursivelyDeleteTriviallyDeadInstructions(Inst); +  } + +  // At this point, the code is well formed.  We now do a quick sweep over the +  // inserted code, doing constant propagation and dead code elimination as we +  // go. +  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); +  const std::vector<BasicBlock *> &NewLoopBlocks = L->getBlocks(); +  for (BasicBlock *BB : NewLoopBlocks) { +    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { +      Instruction *Inst = &*I++; + +      if (Value *V = SimplifyInstruction(Inst, {DL, nullptr, DT, AC})) +        if (LI->replacementPreservesLCSSAForm(Inst, V)) +          Inst->replaceAllUsesWith(V); +      if (isInstructionTriviallyDead(Inst)) +        BB->getInstList().erase(Inst); +    } +  } + +  // TODO: after peeling or unrolling, previously loop variant conditions are +  // likely to fold to constants, eagerly propagating those here will require +  // fewer cleanup passes to be run.  Alternatively, a LoopEarlyCSE might be +  // appropriate. +} +  /// Unroll the given loop by Count. The loop must be in LCSSA form.  Unrolling  /// can only fail when the loop's latch block is not terminated by a conditional  /// branch instruction. 
However, if the trip count (and multiple) are not known, @@ -313,19 +339,19 @@ LoopUnrollResult llvm::UnrollLoop(    BasicBlock *Preheader = L->getLoopPreheader();    if (!Preheader) { -    DEBUG(dbgs() << "  Can't unroll; loop preheader-insertion failed.\n"); +    LLVM_DEBUG(dbgs() << "  Can't unroll; loop preheader-insertion failed.\n");      return LoopUnrollResult::Unmodified;    }    BasicBlock *LatchBlock = L->getLoopLatch();    if (!LatchBlock) { -    DEBUG(dbgs() << "  Can't unroll; loop exit-block-insertion failed.\n"); +    LLVM_DEBUG(dbgs() << "  Can't unroll; loop exit-block-insertion failed.\n");      return LoopUnrollResult::Unmodified;    }    // Loops with indirectbr cannot be cloned.    if (!L->isSafeToClone()) { -    DEBUG(dbgs() << "  Can't unroll; Loop body cannot be cloned.\n"); +    LLVM_DEBUG(dbgs() << "  Can't unroll; Loop body cannot be cloned.\n");      return LoopUnrollResult::Unmodified;    } @@ -338,8 +364,9 @@ LoopUnrollResult llvm::UnrollLoop(    if (!BI || BI->isUnconditional()) {      // The loop-rotate pass can be helpful to avoid this in many cases. -    DEBUG(dbgs() << -             "  Can't unroll; loop not terminated by a conditional branch.\n"); +    LLVM_DEBUG( +        dbgs() +        << "  Can't unroll; loop not terminated by a conditional branch.\n");      return LoopUnrollResult::Unmodified;    } @@ -348,22 +375,22 @@ LoopUnrollResult llvm::UnrollLoop(    };    if (!CheckSuccessors(0, 1) && !CheckSuccessors(1, 0)) { -    DEBUG(dbgs() << "Can't unroll; only loops with one conditional latch" -                    " exiting the loop can be unrolled\n"); +    LLVM_DEBUG(dbgs() << "Can't unroll; only loops with one conditional latch" +                         " exiting the loop can be unrolled\n");      return LoopUnrollResult::Unmodified;    }    if (Header->hasAddressTaken()) {      // The loop-rotate pass can be helpful to avoid this in many cases. 
-    DEBUG(dbgs() << -          "  Won't unroll loop: address of header block is taken.\n"); +    LLVM_DEBUG( +        dbgs() << "  Won't unroll loop: address of header block is taken.\n");      return LoopUnrollResult::Unmodified;    }    if (TripCount != 0) -    DEBUG(dbgs() << "  Trip Count = " << TripCount << "\n"); +    LLVM_DEBUG(dbgs() << "  Trip Count = " << TripCount << "\n");    if (TripMultiple != 1) -    DEBUG(dbgs() << "  Trip Multiple = " << TripMultiple << "\n"); +    LLVM_DEBUG(dbgs() << "  Trip Multiple = " << TripMultiple << "\n");    // Effectively "DCE" unrolled iterations that are beyond the tripcount    // and will never be executed. @@ -372,7 +399,7 @@ LoopUnrollResult llvm::UnrollLoop(    // Don't enter the unroll code if there is nothing to do.    if (TripCount == 0 && Count < 2 && PeelCount == 0) { -    DEBUG(dbgs() << "Won't unroll; almost nothing to do\n"); +    LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");      return LoopUnrollResult::Unmodified;    } @@ -406,8 +433,9 @@ LoopUnrollResult llvm::UnrollLoop(           "Did not expect runtime trip-count unrolling "           "and peeling for the same loop"); +  bool Peeled = false;    if (PeelCount) { -    bool Peeled = peelLoop(L, PeelCount, LI, SE, DT, AC, PreserveLCSSA); +    Peeled = peelLoop(L, PeelCount, LI, SE, DT, AC, PreserveLCSSA);      // Successful peeling may result in a change in the loop preheader/trip      // counts. If we later unroll the loop, we want these to be updated. @@ -422,7 +450,7 @@ LoopUnrollResult llvm::UnrollLoop(    // Loops containing convergent instructions must have a count that divides    // their TripMultiple. 
-  DEBUG( +  LLVM_DEBUG(        {          bool HasConvergent = false;          for (auto &BB : L->blocks()) @@ -445,18 +473,12 @@ LoopUnrollResult llvm::UnrollLoop(      if (Force)        RuntimeTripCount = false;      else { -      DEBUG( -          dbgs() << "Wont unroll; remainder loop could not be generated" -                    "when assuming runtime trip count\n"); +      LLVM_DEBUG(dbgs() << "Won't unroll; remainder loop could not be " +                           "generated when assuming runtime trip count\n");        return LoopUnrollResult::Unmodified;      }    } -  // Notify ScalarEvolution that the loop will be substantially changed, -  // if not outright eliminated. -  if (SE) -    SE->forgetLoop(L); -    // If we know the trip count, we know the multiple...    unsigned BreakoutTrip = 0;    if (TripCount != 0) { @@ -471,8 +493,8 @@ LoopUnrollResult llvm::UnrollLoop(    using namespace ore;    // Report the unrolling decision.    if (CompletelyUnroll) { -    DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName() -                 << " with trip count " << TripCount << "!\n"); +    LLVM_DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName() +                      << " with trip count " << TripCount << "!\n");      if (ORE)        ORE->emit([&]() {          return OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(), @@ -481,8 +503,8 @@ LoopUnrollResult llvm::UnrollLoop(                 << NV("UnrollCount", TripCount) << " iterations";        });    } else if (PeelCount) { -    DEBUG(dbgs() << "PEELING loop %" << Header->getName() -                 << " with iteration count " << PeelCount << "!\n"); +    LLVM_DEBUG(dbgs() << "PEELING loop %" << Header->getName() +                      << " with iteration count " << PeelCount << "!\n");      if (ORE)        ORE->emit([&]() {          return OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(), @@ -498,31 +520,42 @@ LoopUnrollResult llvm::UnrollLoop(                 
   << NV("UnrollCount", Count);      }; -    DEBUG(dbgs() << "UNROLLING loop %" << Header->getName() -          << " by " << Count); +    LLVM_DEBUG(dbgs() << "UNROLLING loop %" << Header->getName() << " by " +                      << Count);      if (TripMultiple == 0 || BreakoutTrip != TripMultiple) { -      DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip); +      LLVM_DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip);        if (ORE)          ORE->emit([&]() {            return DiagBuilder() << " with a breakout at trip "                                 << NV("BreakoutTrip", BreakoutTrip);          });      } else if (TripMultiple != 1) { -      DEBUG(dbgs() << " with " << TripMultiple << " trips per branch"); +      LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");        if (ORE)          ORE->emit([&]() {            return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)                                 << " trips per branch";          });      } else if (RuntimeTripCount) { -      DEBUG(dbgs() << " with run-time trip count"); +      LLVM_DEBUG(dbgs() << " with run-time trip count");        if (ORE)          ORE->emit(              [&]() { return DiagBuilder() << " with run-time trip count"; });      } -    DEBUG(dbgs() << "!\n"); +    LLVM_DEBUG(dbgs() << "!\n");    } +  // We are going to make changes to this loop. SCEV may be keeping cached info +  // about it, in particular about backedge taken count. The changes we make +  // are guaranteed to invalidate this information for our loop. It is tempting +  // to only invalidate the loop being unrolled, but it is incorrect as long as +  // all exiting branches from all inner loops have impact on the outer loops, +  // and if something changes inside them then any of outer loops may also +  // change. When we forget outermost loop, we also forget all contained loops +  // and this is what we need here. 
+  if (SE) +    SE->forgetTopmostLoop(L); +    bool ContinueOnTrue = L->contains(BI->getSuccessor(0));    BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue); @@ -580,14 +613,9 @@ LoopUnrollResult llvm::UnrollLoop(               "Header should not be in a sub-loop");        // Tell LI about New.        const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops); -      if (OldLoop) { +      if (OldLoop)          LoopsToSimplify.insert(NewLoops[OldLoop]); -        // Forget the old loop, since its inputs may have changed. -        if (SE) -          SE->forgetLoop(OldLoop); -      } -        if (*BB == Header)          // Loop over all of the PHI nodes in the block, changing them to use          // the incoming values from the previous block. @@ -611,13 +639,12 @@ LoopUnrollResult llvm::UnrollLoop(        for (BasicBlock *Succ : successors(*BB)) {          if (L->contains(Succ))            continue; -        for (BasicBlock::iterator BBI = Succ->begin(); -             PHINode *phi = dyn_cast<PHINode>(BBI); ++BBI) { -          Value *Incoming = phi->getIncomingValueForBlock(*BB); +        for (PHINode &PHI : Succ->phis()) { +          Value *Incoming = PHI.getIncomingValueForBlock(*BB);            ValueToValueMapTy::iterator It = LastValueMap.find(Incoming);            if (It != LastValueMap.end())              Incoming = It->second; -          phi->addIncoming(Incoming, New); +          PHI.addIncoming(Incoming, New);          }        }        // Keep track of new headers and latches as we create them, so that @@ -721,10 +748,8 @@ LoopUnrollResult llvm::UnrollLoop(          for (BasicBlock *Succ: successors(BB)) {            if (Succ == Headers[i])              continue; -          for (BasicBlock::iterator BBI = Succ->begin(); -               PHINode *Phi = dyn_cast<PHINode>(BBI); ++BBI) { -            Phi->removeIncomingValue(BB, false); -          } +          for (PHINode &Phi : Succ->phis()) +            Phi.removeIncomingValue(BB, false);       
   }        }        // Replace the conditional branch with an unconditional one. @@ -775,17 +800,15 @@ LoopUnrollResult llvm::UnrollLoop(      }    } -  if (DT && UnrollVerifyDomtree) -    DT->verifyDomTree(); +  assert(!DT || !UnrollVerifyDomtree || +      DT->verify(DominatorTree::VerificationLevel::Fast));    // Merge adjacent basic blocks, if possible. -  SmallPtrSet<Loop *, 4> ForgottenLoops;    for (BasicBlock *Latch : Latches) {      BranchInst *Term = cast<BranchInst>(Latch->getTerminator());      if (Term->isUnconditional()) {        BasicBlock *Dest = Term->getSuccessor(0); -      if (BasicBlock *Fold = -              foldBlockIntoPredecessor(Dest, LI, SE, ForgottenLoops, DT)) { +      if (BasicBlock *Fold = foldBlockIntoPredecessor(Dest, LI, SE, DT)) {          // Dest has been folded into Fold. Update our worklists accordingly.          std::replace(Latches.begin(), Latches.end(), Dest, Fold);          UnrolledLoopBlocks.erase(std::remove(UnrolledLoopBlocks.begin(), @@ -795,40 +818,10 @@ LoopUnrollResult llvm::UnrollLoop(      }    } -  // Simplify any new induction variables in the partially unrolled loop. -  if (SE && !CompletelyUnroll && Count > 1) { -    SmallVector<WeakTrackingVH, 16> DeadInsts; -    simplifyLoopIVs(L, SE, DT, LI, DeadInsts); - -    // Aggressively clean up dead instructions that simplifyLoopIVs already -    // identified. Any remaining should be cleaned up below. -    while (!DeadInsts.empty()) -      if (Instruction *Inst = -              dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) -        RecursivelyDeleteTriviallyDeadInstructions(Inst); -  } - -  // At this point, the code is well formed.  We now do a quick sweep over the -  // inserted code, doing constant propagation and dead code elimination as we -  // go. 
-  const DataLayout &DL = Header->getModule()->getDataLayout(); -  const std::vector<BasicBlock*> &NewLoopBlocks = L->getBlocks(); -  for (BasicBlock *BB : NewLoopBlocks) { -    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { -      Instruction *Inst = &*I++; - -      if (Value *V = SimplifyInstruction(Inst, {DL, nullptr, DT, AC})) -        if (LI->replacementPreservesLCSSAForm(Inst, V)) -          Inst->replaceAllUsesWith(V); -      if (isInstructionTriviallyDead(Inst)) -        BB->getInstList().erase(Inst); -    } -  } - -  // TODO: after peeling or unrolling, previously loop variant conditions are -  // likely to fold to constants, eagerly propagating those here will require -  // fewer cleanup passes to be run.  Alternatively, a LoopEarlyCSE might be -  // appropriate. +  // At this point, the code is well formed.  We now simplify the unrolled loop, +  // doing constant propagation and dead code elimination as we go. +  simplifyLoopAfterUnroll(L, !CompletelyUnroll && (Count > 1 || Peeled), LI, SE, +                          DT, AC);    NumCompletelyUnrolled += CompletelyUnroll;    ++NumUnrolled; diff --git a/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/lib/Transforms/Utils/LoopUnrollAndJam.cpp new file mode 100644 index 000000000000..b919f73c3817 --- /dev/null +++ b/lib/Transforms/Utils/LoopUnrollAndJam.cpp @@ -0,0 +1,785 @@ +//===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===// +// +//                     The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements loop unroll and jam as a routine, much like +// LoopUnroll.cpp implements loop unroll. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/Analysis/LoopIterator.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/Utils/Local.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/SimplifyIndVar.h" +#include "llvm/Transforms/Utils/UnrollLoop.h" +using namespace llvm; + +#define DEBUG_TYPE "loop-unroll-and-jam" + +STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed"); +STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed"); + +typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet; + +// Partition blocks in an outer/inner loop pair into blocks before and after +// the loop +static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop, +                                     BasicBlockSet &ForeBlocks, +                                     BasicBlockSet &SubLoopBlocks, +                                     BasicBlockSet &AftBlocks, +                                     DominatorTree *DT) { +  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch(); +  SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end()); + +  
for (BasicBlock *BB : L->blocks()) { +    if (!SubLoop->contains(BB)) { +      if (DT->dominates(SubLoopLatch, BB)) +        AftBlocks.insert(BB); +      else +        ForeBlocks.insert(BB); +    } +  } + +  // Check that all blocks in ForeBlocks together dominate the subloop +  // TODO: This might ideally be done better with a dominator/postdominators. +  BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader(); +  for (BasicBlock *BB : ForeBlocks) { +    if (BB == SubLoopPreHeader) +      continue; +    TerminatorInst *TI = BB->getTerminator(); +    for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) +      if (!ForeBlocks.count(TI->getSuccessor(i))) +        return false; +  } + +  return true; +} + +// Looks at the phi nodes in Header for values coming from Latch. For these +// instructions and all their operands calls Visit on them, keeping going for +// all the operands in AftBlocks. Returns false if Visit returns false, +// otherwise returns true. This is used to process the instructions in the +// Aft blocks that need to be moved before the subloop. It is used in two +// places. One to check that the required set of instructions can be moved +// before the loop. Then to collect the instructions to actually move in +// moveHeaderPhiOperandsToForeBlocks. 
+template <typename T> +static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch, +                                     BasicBlockSet &AftBlocks, T Visit) { +  SmallVector<Instruction *, 8> Worklist; +  for (auto &Phi : Header->phis()) { +    Value *V = Phi.getIncomingValueForBlock(Latch); +    if (Instruction *I = dyn_cast<Instruction>(V)) +      Worklist.push_back(I); +  } + +  while (!Worklist.empty()) { +    Instruction *I = Worklist.back(); +    Worklist.pop_back(); +    if (!Visit(I)) +      return false; + +    if (AftBlocks.count(I->getParent())) +      for (auto &U : I->operands()) +        if (Instruction *II = dyn_cast<Instruction>(U)) +          Worklist.push_back(II); +  } + +  return true; +} + +// Move the phi operands of Header from Latch out of AftBlocks to InsertLoc. +static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header, +                                              BasicBlock *Latch, +                                              Instruction *InsertLoc, +                                              BasicBlockSet &AftBlocks) { +  // We need to ensure we move the instructions in the correct order, +  // starting with the earliest required instruction and moving forward. +  std::vector<Instruction *> Visited; +  processHeaderPhiOperands(Header, Latch, AftBlocks, +                           [&Visited, &AftBlocks](Instruction *I) { +                             if (AftBlocks.count(I->getParent())) +                               Visited.push_back(I); +                             return true; +                           }); + +  // Move all instructions in program order to before the InsertLoc +  BasicBlock *InsertLocBB = InsertLoc->getParent(); +  for (Instruction *I : reverse(Visited)) { +    if (I->getParent() != InsertLocBB) +      I->moveBefore(InsertLoc); +  } +} + +/* +  This method performs Unroll and Jam. For a simple loop like: +  for (i = ..) +    Fore(i) +    for (j = ..) 
+      SubLoop(i, j) +    Aft(i) + +  Instead of doing normal inner or outer unrolling, we do: +  for (i = .., i+=2) +    Fore(i) +    Fore(i+1) +    for (j = ..) +      SubLoop(i, j) +      SubLoop(i+1, j) +    Aft(i) +    Aft(i+1) + +  So the outer loop is essentially unrolled and then the inner loops are fused +  ("jammed") together into a single loop. This can increase speed when there +  are loads in SubLoop that are invariant to i, as they become shared between +  the now jammed inner loops. + +  We do this by splitting the blocks in the loop into Fore, Subloop and Aft. +  Fore blocks are those before the inner loop, Aft are those after. Normal +  Unroll code is used to copy each of these sets of blocks and the results are +  combined together into the final form above. + +  isSafeToUnrollAndJam should be used prior to calling this to make sure the +  unrolling will be valid. Checking profitability is also advisable. +*/ +LoopUnrollResult +llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount, +                       unsigned TripMultiple, bool UnrollRemainder, +                       LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, +                       AssumptionCache *AC, OptimizationRemarkEmitter *ORE) { + +  // When we enter here we should have already checked that it is safe +  BasicBlock *Header = L->getHeader(); +  assert(L->getSubLoops().size() == 1); +  Loop *SubLoop = *L->begin(); + +  // Don't enter the unroll code if there is nothing to do. +  if (TripCount == 0 && Count < 2) { +    LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n"); +    return LoopUnrollResult::Unmodified; +  } + +  assert(Count > 0); +  assert(TripMultiple > 0); +  assert(TripCount == 0 || TripCount % TripMultiple == 0); + +  // Are we eliminating the loop control altogether? 
+  bool CompletelyUnroll = (Count == TripCount); + +  // We use the runtime remainder in cases where we don't know trip multiple +  if (TripMultiple == 1 || TripMultiple % Count != 0) { +    if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false, +                                    /*UseEpilogRemainder*/ true, +                                    UnrollRemainder, LI, SE, DT, AC, true)) { +      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be " +                           "generated when assuming runtime trip count\n"); +      return LoopUnrollResult::Unmodified; +    } +  } + +  // Notify ScalarEvolution that the loop will be substantially changed, +  // if not outright eliminated. +  if (SE) { +    SE->forgetLoop(L); +    SE->forgetLoop(SubLoop); +  } + +  using namespace ore; +  // Report the unrolling decision. +  if (CompletelyUnroll) { +    LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %" +                      << Header->getName() << " with trip count " << TripCount +                      << "!\n"); +    ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(), +                                 L->getHeader()) +              << "completely unroll and jammed loop with " +              << NV("UnrollCount", TripCount) << " iterations"); +  } else { +    auto DiagBuilder = [&]() { +      OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(), +                              L->getHeader()); +      return Diag << "unroll and jammed loop by a factor of " +                  << NV("UnrollCount", Count); +    }; + +    LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName() +                      << " by " << Count); +    if (TripMultiple != 1) { +      LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch"); +      ORE->emit([&]() { +        return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple) +                             << " trips 
per branch"; +      }); +    } else { +      LLVM_DEBUG(dbgs() << " with run-time trip count"); +      ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; }); +    } +    LLVM_DEBUG(dbgs() << "!\n"); +  } + +  BasicBlock *Preheader = L->getLoopPreheader(); +  BasicBlock *LatchBlock = L->getLoopLatch(); +  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator()); +  assert(Preheader && LatchBlock && Header); +  assert(BI && !BI->isUnconditional()); +  bool ContinueOnTrue = L->contains(BI->getSuccessor(0)); +  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue); +  bool SubLoopContinueOnTrue = SubLoop->contains( +      SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0)); + +  // Partition blocks in an outer/inner loop pair into blocks before and after +  // the loop +  BasicBlockSet SubLoopBlocks; +  BasicBlockSet ForeBlocks; +  BasicBlockSet AftBlocks; +  partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks, +                           DT); + +  // We keep track of the entering/first and exiting/last block of each of +  // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of +  // blocks easier. +  std::vector<BasicBlock *> ForeBlocksFirst; +  std::vector<BasicBlock *> ForeBlocksLast; +  std::vector<BasicBlock *> SubLoopBlocksFirst; +  std::vector<BasicBlock *> SubLoopBlocksLast; +  std::vector<BasicBlock *> AftBlocksFirst; +  std::vector<BasicBlock *> AftBlocksLast; +  ForeBlocksFirst.push_back(Header); +  ForeBlocksLast.push_back(SubLoop->getLoopPreheader()); +  SubLoopBlocksFirst.push_back(SubLoop->getHeader()); +  SubLoopBlocksLast.push_back(SubLoop->getExitingBlock()); +  AftBlocksFirst.push_back(SubLoop->getExitBlock()); +  AftBlocksLast.push_back(L->getExitingBlock()); +  // Maps Blocks[0] -> Blocks[It] +  ValueToValueMapTy LastValueMap; + +  // Move any instructions from fore phi operands from AftBlocks into Fore. 
+  moveHeaderPhiOperandsToForeBlocks( +      Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(), +      AftBlocks); + +  // The current on-the-fly SSA update requires blocks to be processed in +  // reverse postorder so that LastValueMap contains the correct value at each +  // exit. +  LoopBlocksDFS DFS(L); +  DFS.perform(LI); +  // Stash the DFS iterators before adding blocks to the loop. +  LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO(); +  LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO(); + +  if (Header->getParent()->isDebugInfoForProfiling()) +    for (BasicBlock *BB : L->getBlocks()) +      for (Instruction &I : *BB) +        if (!isa<DbgInfoIntrinsic>(&I)) +          if (const DILocation *DIL = I.getDebugLoc()) +            I.setDebugLoc(DIL->cloneWithDuplicationFactor(Count)); + +  // Copy all blocks +  for (unsigned It = 1; It != Count; ++It) { +    std::vector<BasicBlock *> NewBlocks; +    // Maps Blocks[It] -> Blocks[It-1] +    DenseMap<Value *, Value *> PrevItValueMap; + +    for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) { +      ValueToValueMapTy VMap; +      BasicBlock *New = CloneBasicBlock(*BB, VMap, "." 
+ Twine(It)); +      Header->getParent()->getBasicBlockList().push_back(New); + +      if (ForeBlocks.count(*BB)) { +        L->addBasicBlockToLoop(New, *LI); + +        if (*BB == ForeBlocksFirst[0]) +          ForeBlocksFirst.push_back(New); +        if (*BB == ForeBlocksLast[0]) +          ForeBlocksLast.push_back(New); +      } else if (SubLoopBlocks.count(*BB)) { +        SubLoop->addBasicBlockToLoop(New, *LI); + +        if (*BB == SubLoopBlocksFirst[0]) +          SubLoopBlocksFirst.push_back(New); +        if (*BB == SubLoopBlocksLast[0]) +          SubLoopBlocksLast.push_back(New); +      } else if (AftBlocks.count(*BB)) { +        L->addBasicBlockToLoop(New, *LI); + +        if (*BB == AftBlocksFirst[0]) +          AftBlocksFirst.push_back(New); +        if (*BB == AftBlocksLast[0]) +          AftBlocksLast.push_back(New); +      } else { +        llvm_unreachable("BB being cloned should be in Fore/Sub/Aft"); +      } + +      // Update our running maps of newest clones +      PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]); +      LastValueMap[*BB] = New; +      for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end(); +           VI != VE; ++VI) { +        PrevItValueMap[VI->second] = +            const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]); +        LastValueMap[VI->first] = VI->second; +      } + +      NewBlocks.push_back(New); + +      // Update DomTree: +      if (*BB == ForeBlocksFirst[0]) +        DT->addNewBlock(New, ForeBlocksLast[It - 1]); +      else if (*BB == SubLoopBlocksFirst[0]) +        DT->addNewBlock(New, SubLoopBlocksLast[It - 1]); +      else if (*BB == AftBlocksFirst[0]) +        DT->addNewBlock(New, AftBlocksLast[It - 1]); +      else { +        // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree +        // structure. 
+        auto BBDomNode = DT->getNode(*BB); +        auto BBIDom = BBDomNode->getIDom(); +        BasicBlock *OriginalBBIDom = BBIDom->getBlock(); +        assert(OriginalBBIDom); +        assert(LastValueMap[cast<Value>(OriginalBBIDom)]); +        DT->addNewBlock( +            New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)])); +      } +    } + +    // Remap all instructions in the most recent iteration +    for (BasicBlock *NewBlock : NewBlocks) { +      for (Instruction &I : *NewBlock) { +        ::remapInstruction(&I, LastValueMap); +        if (auto *II = dyn_cast<IntrinsicInst>(&I)) +          if (II->getIntrinsicID() == Intrinsic::assume) +            AC->registerAssumption(II); +      } +    } + +    // Alter the ForeBlocks phi's, pointing them at the latest version of the +    // value from the previous iteration's phis +    for (PHINode &Phi : ForeBlocksFirst[It]->phis()) { +      Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]); +      assert(OldValue && "should have incoming edge from Aft[It]"); +      Value *NewValue = OldValue; +      if (Value *PrevValue = PrevItValueMap[OldValue]) +        NewValue = PrevValue; + +      assert(Phi.getNumOperands() == 2); +      Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]); +      Phi.setIncomingValue(0, NewValue); +      Phi.removeIncomingValue(1); +    } +  } + +  // Now that all the basic blocks for the unrolled iterations are in place, +  // finish up connecting the blocks and phi nodes. At this point LastValueMap +  // is the last unrolled iterations values. 
+ +  // Update Phis in BB from OldBB to point to NewBB +  auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB, +                            BasicBlock *NewBB) { +    for (PHINode &Phi : BB->phis()) { +      int I = Phi.getBasicBlockIndex(OldBB); +      Phi.setIncomingBlock(I, NewBB); +    } +  }; +  // Update Phis in BB from OldBB to point to NewBB and use the latest value +  // from LastValueMap +  auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB, +                                     BasicBlock *NewBB, +                                     ValueToValueMapTy &LastValueMap) { +    for (PHINode &Phi : BB->phis()) { +      for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) { +        if (Phi.getIncomingBlock(b) == OldBB) { +          Value *OldValue = Phi.getIncomingValue(b); +          if (Value *LastValue = LastValueMap[OldValue]) +            Phi.setIncomingValue(b, LastValue); +          Phi.setIncomingBlock(b, NewBB); +          break; +        } +      } +    } +  }; +  // Move all the phis from Src into Dest +  auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) { +    Instruction *insertPoint = Dest->getFirstNonPHI(); +    while (PHINode *Phi = dyn_cast<PHINode>(Src->begin())) +      Phi->moveBefore(insertPoint); +  }; + +  // Update the PHI values outside the loop to point to the last block +  updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(), +                           LastValueMap); + +  // Update ForeBlocks successors and phi nodes +  BranchInst *ForeTerm = +      cast<BranchInst>(ForeBlocksLast.back()->getTerminator()); +  BasicBlock *Dest = SubLoopBlocksFirst[0]; +  ForeTerm->setSuccessor(0, Dest); + +  if (CompletelyUnroll) { +    while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) { +      Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader)); +      Phi->getParent()->getInstList().erase(Phi); +    } +  } else { +    // Update the PHI values to point to the 
last aft block +    updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0], +                             AftBlocksLast.back(), LastValueMap); +  } + +  for (unsigned It = 1; It != Count; It++) { +    // Remap ForeBlock successors from previous iteration to this +    BranchInst *ForeTerm = +        cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator()); +    BasicBlock *Dest = ForeBlocksFirst[It]; +    ForeTerm->setSuccessor(0, Dest); +  } + +  // Subloop successors and phis +  BranchInst *SubTerm = +      cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator()); +  SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]); +  SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]); +  updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0], +                  ForeBlocksLast.back()); +  updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0], +                  SubLoopBlocksLast.back()); + +  for (unsigned It = 1; It != Count; It++) { +    // Replace the conditional branch of the previous iteration subloop with an +    // unconditional one to this one +    BranchInst *SubTerm = +        cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator()); +    BranchInst::Create(SubLoopBlocksFirst[It], SubTerm); +    SubTerm->eraseFromParent(); + +    updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It], +                    ForeBlocksLast.back()); +    updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It], +                    SubLoopBlocksLast.back()); +    movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]); +  } + +  // Aft blocks successors and phis +  BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator()); +  if (CompletelyUnroll) { +    BranchInst::Create(LoopExit, Term); +    Term->eraseFromParent(); +  } else { +    Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]); +  } +  updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0], +                  SubLoopBlocksLast.back()); + +  for 
(unsigned It = 1; It != Count; It++) {
+    // Replace the conditional branch of the previous iteration aft block with
+    // an unconditional one to this one
+    BranchInst *AftTerm =
+        cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
+    BranchInst::Create(AftBlocksFirst[It], AftTerm);
+    AftTerm->eraseFromParent();
+
+    updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
+                    SubLoopBlocksLast.back());
+    movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
+  }
+
+  // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
+  // new ones required.
+  if (Count != 1) {
+    SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
+                           SubLoopBlocksFirst[0]);
+    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
+                           SubLoopBlocksLast[0], AftBlocksFirst[0]);
+
+    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
+                           ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
+    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
+                           SubLoopBlocksLast.back(), AftBlocksFirst[0]);
+    DT->applyUpdates(DTUpdates);
+  }
+
+  // Merge adjacent basic blocks, if possible. 
+  SmallPtrSet<BasicBlock *, 16> MergeBlocks; +  MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end()); +  MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end()); +  MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end()); +  while (!MergeBlocks.empty()) { +    BasicBlock *BB = *MergeBlocks.begin(); +    BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator()); +    if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) { +      BasicBlock *Dest = Term->getSuccessor(0); +      if (BasicBlock *Fold = foldBlockIntoPredecessor(Dest, LI, SE, DT)) { +        // Don't remove BB and add Fold as they are the same BB +        assert(Fold == BB); +        (void)Fold; +        MergeBlocks.erase(Dest); +      } else +        MergeBlocks.erase(BB); +    } else +      MergeBlocks.erase(BB); +  } + +  // At this point, the code is well formed.  We now do a quick sweep over the +  // inserted code, doing constant propagation and dead code elimination as we +  // go. +  simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC); +  simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC); + +  NumCompletelyUnrolledAndJammed += CompletelyUnroll; +  ++NumUnrolledAndJammed; + +#ifndef NDEBUG +  // We shouldn't have done anything to break loop simplify form or LCSSA. +  Loop *OuterL = L->getParentLoop(); +  Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop); +  assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI)); +  if (!CompletelyUnroll) +    assert(L->isLoopSimplifyForm()); +  assert(SubLoop->isLoopSimplifyForm()); +  assert(DT->verify()); +#endif + +  // Update LoopInfo if the loop is completely removed. +  if (CompletelyUnroll) +    LI->erase(L); + +  return CompletelyUnroll ? 
LoopUnrollResult::FullyUnrolled
+                          : LoopUnrollResult::PartiallyUnrolled;
+}
+
+static bool getLoadsAndStores(BasicBlockSet &Blocks,
+                              SmallVector<Value *, 4> &MemInstr) {
+  // Scan the BBs and collect legal loads and stores.
+  // Returns false if non-simple loads/stores are found.
+  for (BasicBlock *BB : Blocks) {
+    for (Instruction &I : *BB) {
+      if (auto *Ld = dyn_cast<LoadInst>(&I)) {
+        if (!Ld->isSimple())
+          return false;
+        MemInstr.push_back(&I);
+      } else if (auto *St = dyn_cast<StoreInst>(&I)) {
+        if (!St->isSimple())
+          return false;
+        MemInstr.push_back(&I);
+      } else if (I.mayReadOrWriteMemory()) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
+                              SmallVector<Value *, 4> &Later,
+                              unsigned LoopDepth, bool InnerLoop,
+                              DependenceInfo &DI) {
+  // Use DA to check for dependencies between loads and stores that make unroll
+  // and jam invalid
+  for (Value *I : Earlier) {
+    for (Value *J : Later) {
+      Instruction *Src = cast<Instruction>(I);
+      Instruction *Dst = cast<Instruction>(J);
+      if (Src == Dst)
+        continue;
+      // Ignore Input dependencies.
+      if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
+        continue;
+
+      // Track dependencies, and if we find them take a conservative approach
+      // by allowing only = or < (not >), although some > would be safe
+      // (depending upon unroll width). 
+      // For the inner loop, we need to disallow any (> <) dependencies +      // FIXME: Allow > so long as distance is less than unroll width +      if (auto D = DI.depends(Src, Dst, true)) { +        assert(D->isOrdered() && "Expected an output, flow or anti dep."); + +        if (D->isConfused()) +          return false; +        if (!InnerLoop) { +          if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) +            return false; +        } else { +          assert(LoopDepth + 1 <= D->getLevels()); +          if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT && +              D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT) +            return false; +        } +      } +    } +  } +  return true; +} + +static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks, +                              BasicBlockSet &SubLoopBlocks, +                              BasicBlockSet &AftBlocks, DependenceInfo &DI) { +  // Get all loads/store pairs for each blocks +  SmallVector<Value *, 4> ForeMemInstr; +  SmallVector<Value *, 4> SubLoopMemInstr; +  SmallVector<Value *, 4> AftMemInstr; +  if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) || +      !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) || +      !getLoadsAndStores(AftBlocks, AftMemInstr)) +    return false; + +  // Check for dependencies between any blocks that may change order +  unsigned LoopDepth = L->getLoopDepth(); +  return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false, +                           DI) && +         checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) && +         checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false, +                           DI) && +         checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true, +                           DI); +} + +bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, +                                DependenceInfo &DI) { +  /* We 
currently handle outer loops like this:
+        |
+    ForeFirst    <----\    }
+     Blocks           |    } ForeBlocks
+    ForeLast          |    }
+        |             |
+    SubLoopFirst  <\  |    }
+     Blocks        |  |    } SubLoopBlocks
+    SubLoopLast   -/  |    }
+        |             |
+    AftFirst          |    }
+     Blocks           |    } AftBlocks
+    AftLast     ------/    }
+        |
+
+    There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
+    and AftBlocks, providing that there is one edge from Fores to SubLoops,
+    one edge from SubLoops to Afts and a single outer loop exit (from Afts).
+    In practice we currently limit Aft blocks to a single block, and limit
+    things further in the profitability checks of the unroll and jam pass.
+
+    Because of the way we rearrange basic blocks, we also require that
+    the Fore blocks on all unrolled iterations are safe to move before the
+    SubLoop blocks of all iterations. So we require that the phi node looping
+    operands of ForeHeader can be moved to at least the end of ForeEnd, so that
+    we can arrange cloned Fore Blocks before the subloop and match up Phi's
+    correctly.
+
+    i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
+    It needs to be safe to transform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.
+
+    There are then a number of checks along the lines of no calls, no
+    exceptions, inner loop IV is consistent, etc. Note that for loops requiring
+    runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
+    UnrollAndJamLoop if the trip count cannot be easily calculated. 
+  */ + +  if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1) +    return false; +  Loop *SubLoop = L->getSubLoops()[0]; +  if (!SubLoop->isLoopSimplifyForm()) +    return false; + +  BasicBlock *Header = L->getHeader(); +  BasicBlock *Latch = L->getLoopLatch(); +  BasicBlock *Exit = L->getExitingBlock(); +  BasicBlock *SubLoopHeader = SubLoop->getHeader(); +  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch(); +  BasicBlock *SubLoopExit = SubLoop->getExitingBlock(); + +  if (Latch != Exit) +    return false; +  if (SubLoopLatch != SubLoopExit) +    return false; + +  if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken()) +    return false; + +  // Split blocks into Fore/SubLoop/Aft based on dominators +  BasicBlockSet SubLoopBlocks; +  BasicBlockSet ForeBlocks; +  BasicBlockSet AftBlocks; +  if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, +                                AftBlocks, &DT)) +    return false; + +  // Aft blocks may need to move instructions to fore blocks, which becomes more +  // difficult if there are multiple (potentially conditionally executed) +  // blocks. For now we just exclude loops with multiple aft blocks. +  if (AftBlocks.size() != 1) +    return false; + +  // Check inner loop IV is consistent between all iterations +  const SCEV *SubLoopBECountSC = SE.getExitCount(SubLoop, SubLoopLatch); +  if (isa<SCEVCouldNotCompute>(SubLoopBECountSC) || +      !SubLoopBECountSC->getType()->isIntegerTy()) +    return false; +  ScalarEvolution::LoopDisposition LD = +      SE.getLoopDisposition(SubLoopBECountSC, L); +  if (LD != ScalarEvolution::LoopInvariant) +    return false; + +  // Check the loop safety info for exceptions. +  LoopSafetyInfo LSI; +  computeLoopSafetyInfo(&LSI, L); +  if (LSI.MayThrow) +    return false; + +  // We've ruled out the easy stuff and now need to check that there are no +  // interdependencies which may prevent us from moving the: +  //  ForeBlocks before Subloop and AftBlocks. 
+  //  Subloop before AftBlocks. +  //  ForeBlock phi operands before the subloop + +  // Make sure we can move all instructions we need to before the subloop +  if (!processHeaderPhiOperands( +          Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) { +            if (SubLoop->contains(I->getParent())) +              return false; +            if (AftBlocks.count(I->getParent())) { +              // If we hit a phi node in afts we know we are done (probably +              // LCSSA) +              if (isa<PHINode>(I)) +                return false; +              // Can't move instructions with side effects or memory +              // reads/writes +              if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory()) +                return false; +            } +            // Keep going +            return true; +          })) +    return false; + +  // Check for memory dependencies which prohibit the unrolling we are doing. +  // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check +  // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub. 
+  if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI)) +    return false; + +  return true; +} diff --git a/lib/Transforms/Utils/LoopUnrollPeel.cpp b/lib/Transforms/Utils/LoopUnrollPeel.cpp index c84ae7d693d7..13794c53f24b 100644 --- a/lib/Transforms/Utils/LoopUnrollPeel.cpp +++ b/lib/Transforms/Utils/LoopUnrollPeel.cpp @@ -20,6 +20,7 @@  #include "llvm/Analysis/LoopInfo.h"  #include "llvm/Analysis/LoopIterator.h"  #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h"  #include "llvm/Analysis/TargetTransformInfo.h"  #include "llvm/IR/BasicBlock.h"  #include "llvm/IR/Dominators.h" @@ -30,6 +31,7 @@  #include "llvm/IR/LLVMContext.h"  #include "llvm/IR/MDBuilder.h"  #include "llvm/IR/Metadata.h" +#include "llvm/IR/PatternMatch.h"  #include "llvm/Support/Casting.h"  #include "llvm/Support/CommandLine.h"  #include "llvm/Support/Debug.h" @@ -46,6 +48,7 @@  #include <limits>  using namespace llvm; +using namespace llvm::PatternMatch;  #define DEBUG_TYPE "loop-unroll" @@ -66,7 +69,7 @@ static const unsigned InfiniteIterationsToInvariance =      std::numeric_limits<unsigned>::max();  // Check whether we are capable of peeling this loop. -static bool canPeel(Loop *L) { +bool llvm::canPeel(Loop *L) {    // Make sure the loop is in simplified form    if (!L->isLoopSimplifyForm())      return false; @@ -136,11 +139,109 @@ static unsigned calculateIterationsToInvariance(    return ToInvariance;  } +// Return the number of iterations to peel off that make conditions in the +// body true/false. For example, if we peel 2 iterations off the loop below, +// the condition i < 2 can be evaluated at compile time. +//  for (i = 0; i < n; i++) +//    if (i < 2) +//      .. +//    else +//      .. 
+//   } +static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount, +                                         ScalarEvolution &SE) { +  assert(L.isLoopSimplifyForm() && "Loop needs to be in loop simplify form"); +  unsigned DesiredPeelCount = 0; + +  for (auto *BB : L.blocks()) { +    auto *BI = dyn_cast<BranchInst>(BB->getTerminator()); +    if (!BI || BI->isUnconditional()) +      continue; + +    // Ignore loop exit condition. +    if (L.getLoopLatch() == BB) +      continue; + +    Value *Condition = BI->getCondition(); +    Value *LeftVal, *RightVal; +    CmpInst::Predicate Pred; +    if (!match(Condition, m_ICmp(Pred, m_Value(LeftVal), m_Value(RightVal)))) +      continue; + +    const SCEV *LeftSCEV = SE.getSCEV(LeftVal); +    const SCEV *RightSCEV = SE.getSCEV(RightVal); + +    // Do not consider predicates that are known to be true or false +    // independently of the loop iteration. +    if (SE.isKnownPredicate(Pred, LeftSCEV, RightSCEV) || +        SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), LeftSCEV, +                            RightSCEV)) +      continue; + +    // Check if we have a condition with one AddRec and one non AddRec +    // expression. Normalize LeftSCEV to be the AddRec. +    if (!isa<SCEVAddRecExpr>(LeftSCEV)) { +      if (isa<SCEVAddRecExpr>(RightSCEV)) { +        std::swap(LeftSCEV, RightSCEV); +        Pred = ICmpInst::getSwappedPredicate(Pred); +      } else +        continue; +    } + +    const SCEVAddRecExpr *LeftAR = cast<SCEVAddRecExpr>(LeftSCEV); + +    // Avoid huge SCEV computations in the loop below, make sure we only +    // consider AddRecs of the loop we are trying to peel and avoid +    // non-monotonic predicates, as we will not be able to simplify the loop +    // body. +    // FIXME: For the non-monotonic predicates ICMP_EQ and ICMP_NE we can +    //        simplify the loop, if we peel 1 additional iteration, if there +    //        is no wrapping. 
+    bool Increasing;
+    if (!LeftAR->isAffine() || LeftAR->getLoop() != &L ||
+        !SE.isMonotonicPredicate(LeftAR, Pred, Increasing))
+      continue;
+    (void)Increasing;
+
+    // Check if extending the current DesiredPeelCount lets us evaluate Pred
+    // or !Pred in the loop body statically.
+    unsigned NewPeelCount = DesiredPeelCount;
+
+    const SCEV *IterVal = LeftAR->evaluateAtIteration(
+        SE.getConstant(LeftSCEV->getType(), NewPeelCount), SE);
+
+    // If the original condition is not known, get the negated predicate
+    // (which holds on the else branch) and check if it is known. This allows
+    // us to peel off iterations that make the original condition false.
+    if (!SE.isKnownPredicate(Pred, IterVal, RightSCEV))
+      Pred = ICmpInst::getInversePredicate(Pred);
+
+    const SCEV *Step = LeftAR->getStepRecurrence(SE);
+    while (NewPeelCount < MaxPeelCount &&
+           SE.isKnownPredicate(Pred, IterVal, RightSCEV)) {
+      IterVal = SE.getAddExpr(IterVal, Step);
+      NewPeelCount++;
+    }
+
+    // Only peel the loop if the monotonic predicate !Pred becomes known in the
+    // first iteration of the loop body after peeling.
+    if (NewPeelCount > DesiredPeelCount &&
+        SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), IterVal,
+                            RightSCEV))
+      DesiredPeelCount = NewPeelCount;
+  }
+
+  return DesiredPeelCount;
+}
+
 // Return the number of iterations we want to peel off.
 void llvm::computePeelCount(Loop *L, unsigned LoopSize,
                             TargetTransformInfo::UnrollingPreferences &UP,
-                            unsigned &TripCount) {
+                            unsigned &TripCount, ScalarEvolution &SE) {
   assert(LoopSize > 0 && "Zero loop size is not allowed!");
+  // Save the UP.PeelCount value set by the target in
+  // TTI.getUnrollingPreferences or by the flag -unroll-peel-count. 
+  unsigned TargetPeelCount = UP.PeelCount;    UP.PeelCount = 0;    if (!canPeel(L))      return; @@ -149,6 +250,19 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,    if (!L->empty())      return; +  // If the user provided a peel count, use that. +  bool UserPeelCount = UnrollForcePeelCount.getNumOccurrences() > 0; +  if (UserPeelCount) { +    LLVM_DEBUG(dbgs() << "Force-peeling first " << UnrollForcePeelCount +                      << " iterations.\n"); +    UP.PeelCount = UnrollForcePeelCount; +    return; +  } + +  // Skip peeling if it's disabled. +  if (!UP.AllowPeeling) +    return; +    // Here we try to get rid of Phis which become invariants after 1, 2, ..., N    // iterations of the loop. For this we compute the number for iterations after    // which every Phi is guaranteed to become an invariant, and try to peel the @@ -160,7 +274,9 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,      SmallDenseMap<PHINode *, unsigned> IterationsToInvariance;      // Now go through all Phis to calculate their the number of iterations they      // need to become invariants. -    unsigned DesiredPeelCount = 0; +    // Start the max computation with the UP.PeelCount value set by the target +    // in TTI.getUnrollingPreferences or by the flag -unroll-peel-count. +    unsigned DesiredPeelCount = TargetPeelCount;      BasicBlock *BackEdge = L->getLoopLatch();      assert(BackEdge && "Loop is not in simplified form?");      for (auto BI = L->getHeader()->begin(); isa<PHINode>(&*BI); ++BI) { @@ -170,15 +286,21 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,        if (ToInvariance != InfiniteIterationsToInvariance)          DesiredPeelCount = std::max(DesiredPeelCount, ToInvariance);      } + +    // Pay respect to limitations implied by loop size and the max peel count. 
+    unsigned MaxPeelCount = UnrollPeelMaxCount; +    MaxPeelCount = std::min(MaxPeelCount, UP.Threshold / LoopSize - 1); + +    DesiredPeelCount = std::max(DesiredPeelCount, +                                countToEliminateCompares(*L, MaxPeelCount, SE)); +      if (DesiredPeelCount > 0) { -      // Pay respect to limitations implied by loop size and the max peel count. -      unsigned MaxPeelCount = UnrollPeelMaxCount; -      MaxPeelCount = std::min(MaxPeelCount, UP.Threshold / LoopSize - 1);        DesiredPeelCount = std::min(DesiredPeelCount, MaxPeelCount);        // Consider max peel count limitation.        assert(DesiredPeelCount > 0 && "Wrong loop size estimation?"); -      DEBUG(dbgs() << "Peel " << DesiredPeelCount << " iteration(s) to turn" -                   << " some Phis into invariants.\n"); +      LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount +                        << " iteration(s) to turn" +                        << " some Phis into invariants.\n");        UP.PeelCount = DesiredPeelCount;        return;      } @@ -189,44 +311,37 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,    if (TripCount)      return; -  // If the user provided a peel count, use that. -  bool UserPeelCount = UnrollForcePeelCount.getNumOccurrences() > 0; -  if (UserPeelCount) { -    DEBUG(dbgs() << "Force-peeling first " << UnrollForcePeelCount -                 << " iterations.\n"); -    UP.PeelCount = UnrollForcePeelCount; -    return; -  } -    // If we don't know the trip count, but have reason to believe the average    // trip count is low, peeling should be beneficial, since we will usually    // hit the peeled section.    // We only do this in the presence of profile information, since otherwise    // our estimates of the trip count are not reliable enough. 
-  if (UP.AllowPeeling && L->getHeader()->getParent()->hasProfileData()) { +  if (L->getHeader()->getParent()->hasProfileData()) {      Optional<unsigned> PeelCount = getLoopEstimatedTripCount(L);      if (!PeelCount)        return; -    DEBUG(dbgs() << "Profile-based estimated trip count is " << *PeelCount -                 << "\n"); +    LLVM_DEBUG(dbgs() << "Profile-based estimated trip count is " << *PeelCount +                      << "\n");      if (*PeelCount) {        if ((*PeelCount <= UnrollPeelMaxCount) &&            (LoopSize * (*PeelCount + 1) <= UP.Threshold)) { -        DEBUG(dbgs() << "Peeling first " << *PeelCount << " iterations.\n"); +        LLVM_DEBUG(dbgs() << "Peeling first " << *PeelCount +                          << " iterations.\n");          UP.PeelCount = *PeelCount;          return;        } -      DEBUG(dbgs() << "Requested peel count: " << *PeelCount << "\n"); -      DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n"); -      DEBUG(dbgs() << "Peel cost: " << LoopSize * (*PeelCount + 1) << "\n"); -      DEBUG(dbgs() << "Max peel cost: " << UP.Threshold << "\n"); +      LLVM_DEBUG(dbgs() << "Requested peel count: " << *PeelCount << "\n"); +      LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n"); +      LLVM_DEBUG(dbgs() << "Peel cost: " << LoopSize * (*PeelCount + 1) +                        << "\n"); +      LLVM_DEBUG(dbgs() << "Max peel cost: " << UP.Threshold << "\n");      }    }  } -/// \brief Update the branch weights of the latch of a peeled-off loop +/// Update the branch weights of the latch of a peeled-off loop  /// iteration.  /// This sets the branch weights for the latch of the recently peeled off loop  /// iteration correctly.  
@@ -267,12 +382,12 @@ static void updateBranchWeights(BasicBlock *Header, BranchInst *LatchBR,    }  } -/// \brief Clones the body of the loop L, putting it between \p InsertTop and \p +/// Clones the body of the loop L, putting it between \p InsertTop and \p  /// InsertBot.  /// \param IterNumber The serial number of the iteration currently being  /// peeled off.  /// \param Exit The exit block of the original loop. -/// \param[out] NewBlocks A list of the the blocks in the newly created clone +/// \param[out] NewBlocks A list of the blocks in the newly created clone  /// \param[out] VMap The value map between the loop and the new clone.  /// \param LoopBlocks A helper for DFS-traversal of the loop.  /// \param LVMap A value-map that maps instructions from the original loop to @@ -376,7 +491,7 @@ static void cloneLoopBlocks(Loop *L, unsigned IterNumber, BasicBlock *InsertTop,      LVMap[KV.first] = KV.second;  } -/// \brief Peel off the first \p PeelCount iterations of loop \p L. +/// Peel off the first \p PeelCount iterations of loop \p L.  ///  /// Note that this does not peel them off as a single straight-line block.  /// Rather, each iteration is peeled off separately, and needs to check the @@ -388,8 +503,8 @@ static void cloneLoopBlocks(Loop *L, unsigned IterNumber, BasicBlock *InsertTop,  bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,                      ScalarEvolution *SE, DominatorTree *DT,                      AssumptionCache *AC, bool PreserveLCSSA) { -  if (!canPeel(L)) -    return false; +  assert(PeelCount > 0 && "Attempt to peel out zero iterations?"); +  assert(canPeel(L) && "Attempt to peel a loop which is not peelable?");    LoopBlocksDFS LoopBlocks(L);    LoopBlocks.perform(LI); @@ -500,10 +615,7 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,        // the original loop body.        
if (Iter == 0)          DT->changeImmediateDominator(Exit, cast<BasicBlock>(LVMap[Latch])); -#ifndef NDEBUG -      if (VerifyDomInfo) -        DT->verifyDomTree(); -#endif +      assert(DT->verify(DominatorTree::VerificationLevel::Fast));      }      updateBranchWeights(InsertBot, cast<BranchInst>(VMap[LatchBR]), Iter, diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp index e00541d3c812..0057b4ba7ce1 100644 --- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -21,8 +21,8 @@  //  //===----------------------------------------------------------------------===// +#include "llvm/ADT/SmallPtrSet.h"  #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/SmallSet.h"  #include "llvm/Analysis/AliasAnalysis.h"  #include "llvm/Analysis/LoopIterator.h"  #include "llvm/Analysis/ScalarEvolution.h" @@ -33,7 +33,7 @@  #include "llvm/IR/Module.h"  #include "llvm/Support/Debug.h"  #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h"  #include "llvm/Transforms/Utils/BasicBlockUtils.h"  #include "llvm/Transforms/Utils/Cloning.h"  #include "llvm/Transforms/Utils/LoopUtils.h" @@ -80,25 +80,21 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,    // The new PHI node value is added as an operand of a PHI node in either    // the loop header or the loop exit block.    for (BasicBlock *Succ : successors(Latch)) { -    for (Instruction &BBI : *Succ) { -      PHINode *PN = dyn_cast<PHINode>(&BBI); -      // Exit when we passed all PHI nodes. -      if (!PN) -        break; +    for (PHINode &PN : Succ->phis()) {        // Add a new PHI node to the prolog end block and add the        // appropriate incoming values. 
-      PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr", +      PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr",                                         PrologExit->getFirstNonPHI());        // Adding a value to the new PHI node from the original loop preheader.        // This is the value that skips all the prolog code. -      if (L->contains(PN)) { -        NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader), +      if (L->contains(&PN)) { +        NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader),                             PreHeader);        } else { -        NewPN->addIncoming(UndefValue::get(PN->getType()), PreHeader); +        NewPN->addIncoming(UndefValue::get(PN.getType()), PreHeader);        } -      Value *V = PN->getIncomingValueForBlock(Latch); +      Value *V = PN.getIncomingValueForBlock(Latch);        if (Instruction *I = dyn_cast<Instruction>(V)) {          if (L->contains(I)) {            V = VMap.lookup(I); @@ -111,10 +107,10 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,        // Update the existing PHI node operand with the value from the        // new PHI node.  How this is done depends on if the existing        // PHI node is in the original loop block, or the exit block. -      if (L->contains(PN)) { -        PN->setIncomingValue(PN->getBasicBlockIndex(NewPreHeader), NewPN); +      if (L->contains(&PN)) { +        PN.setIncomingValue(PN.getBasicBlockIndex(NewPreHeader), NewPN);        } else { -        PN->addIncoming(NewPN, PrologExit); +        PN.addIncoming(NewPN, PrologExit);        }      }    } @@ -191,11 +187,7 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,    // Exit (EpilogPN)    // Update PHI nodes at NewExit and Exit. -  for (Instruction &BBI : *NewExit) { -    PHINode *PN = dyn_cast<PHINode>(&BBI); -    // Exit when we passed all PHI nodes. 
-    if (!PN) -      break; +  for (PHINode &PN : NewExit->phis()) {      // PN should be used in another PHI located in Exit block as      // Exit was split by SplitBlockPredecessors into Exit and NewExit      // Basicaly it should look like: @@ -207,14 +199,14 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,      //      // There is EpilogPreHeader incoming block instead of NewExit as      // NewExit was spilt 1 more time to get EpilogPreHeader. -    assert(PN->hasOneUse() && "The phi should have 1 use"); -    PHINode *EpilogPN = cast<PHINode> (PN->use_begin()->getUser()); +    assert(PN.hasOneUse() && "The phi should have 1 use"); +    PHINode *EpilogPN = cast<PHINode>(PN.use_begin()->getUser());      assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block");      // Add incoming PreHeader from branch around the Loop -    PN->addIncoming(UndefValue::get(PN->getType()), PreHeader); +    PN.addIncoming(UndefValue::get(PN.getType()), PreHeader); -    Value *V = PN->getIncomingValueForBlock(Latch); +    Value *V = PN.getIncomingValueForBlock(Latch);      Instruction *I = dyn_cast<Instruction>(V);      if (I && L->contains(I))        // If value comes from an instruction in the loop add VMap value. @@ -242,23 +234,19 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,      // Skip this as we already updated phis in exit blocks.      if (!L->contains(Succ))        continue; -    for (Instruction &BBI : *Succ) { -      PHINode *PN = dyn_cast<PHINode>(&BBI); -      // Exit when we passed all PHI nodes. -      if (!PN) -        break; +    for (PHINode &PN : Succ->phis()) {        // Add new PHI nodes to the loop exit block and update epilog        // PHIs with the new PHI values. 
-      PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr", +      PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr",                                         NewExit->getFirstNonPHI());        // Adding a value to the new PHI node from the unrolling loop preheader. -      NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader), PreHeader); +      NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader), PreHeader);        // Adding a value to the new PHI node from the unrolling loop latch. -      NewPN->addIncoming(PN->getIncomingValueForBlock(Latch), Latch); +      NewPN->addIncoming(PN.getIncomingValueForBlock(Latch), Latch);        // Update the existing PHI node operand with the value from the new PHI        // node.  Corresponding instruction in epilog loop should be PHI. -      PHINode *VPN = cast<PHINode>(VMap[&BBI]); +      PHINode *VPN = cast<PHINode>(VMap[&PN]);        VPN->setIncomingValue(VPN->getBasicBlockIndex(EpilogPreHeader), NewPN);      }    } @@ -430,8 +418,9 @@ canSafelyUnrollMultiExitLoop(Loop *L, SmallVectorImpl<BasicBlock *> &OtherExits,    // UnrollRuntimeMultiExit is true. This will need updating the logic in    // connectEpilog/connectProlog.    
if (!LatchExit->getSinglePredecessor()) { -    DEBUG(dbgs() << "Bailout for multi-exit handling when latch exit has >1 " -                    "predecessor.\n"); +    LLVM_DEBUG( +        dbgs() << "Bailout for multi-exit handling when latch exit has >1 " +                  "predecessor.\n");      return false;    }    // FIXME: We bail out of multi-exit unrolling when epilog loop is generated @@ -540,14 +529,14 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,                                        LoopInfo *LI, ScalarEvolution *SE,                                        DominatorTree *DT, AssumptionCache *AC,                                        bool PreserveLCSSA) { -  DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n"); -  DEBUG(L->dump()); -  DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n" : -        dbgs() << "Using prolog remainder.\n"); +  LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n"); +  LLVM_DEBUG(L->dump()); +  LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n" +                                : dbgs() << "Using prolog remainder.\n");    // Make sure the loop is in canonical form.    if (!L->isLoopSimplifyForm()) { -    DEBUG(dbgs() << "Not in simplify form!\n"); +    LLVM_DEBUG(dbgs() << "Not in simplify form!\n");      return false;    } @@ -573,7 +562,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,    // Support only single exit and exiting block unless multi-exit loop unrolling is enabled.    
if (!isMultiExitUnrollingEnabled &&        (!L->getExitingBlock() || OtherExits.size())) { -    DEBUG( +    LLVM_DEBUG(          dbgs()          << "Multiple exit/exiting blocks in loop and multi-exit unrolling not "             "enabled!\n"); @@ -593,7 +582,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,    const SCEV *BECountSC = SE->getExitCount(L, Latch);    if (isa<SCEVCouldNotCompute>(BECountSC) ||        !BECountSC->getType()->isIntegerTy()) { -    DEBUG(dbgs() << "Could not compute exit block SCEV\n"); +    LLVM_DEBUG(dbgs() << "Could not compute exit block SCEV\n");      return false;    } @@ -603,7 +592,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,    const SCEV *TripCountSC =        SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));    if (isa<SCEVCouldNotCompute>(TripCountSC)) { -    DEBUG(dbgs() << "Could not compute trip count SCEV.\n"); +    LLVM_DEBUG(dbgs() << "Could not compute trip count SCEV.\n");      return false;    } @@ -613,15 +602,16 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,    SCEVExpander Expander(*SE, DL, "loop-unroll");    if (!AllowExpensiveTripCount &&        Expander.isHighCostExpansion(TripCountSC, L, PreHeaderBR)) { -    DEBUG(dbgs() << "High cost for expanding trip count scev!\n"); +    LLVM_DEBUG(dbgs() << "High cost for expanding trip count scev!\n");      return false;    }    // This constraint lets us deal with an overflowing trip count easily; see the    // comment on ModVal below.    if (Log2_32(Count) > BEWidth) { -    DEBUG(dbgs() -          << "Count failed constraint on overflow trip count calculation.\n"); +    LLVM_DEBUG( +        dbgs() +        << "Count failed constraint on overflow trip count calculation.\n");      return false;    } @@ -775,7 +765,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,    // values from the cloned region. 
Also update the dominator info for    // OtherExits and their immediate successors, since we have new edges into    // OtherExits. -  SmallSet<BasicBlock*, 8> ImmediateSuccessorsOfExitBlocks; +  SmallPtrSet<BasicBlock*, 8> ImmediateSuccessorsOfExitBlocks;    for (auto *BB : OtherExits) {     for (auto &II : *BB) { @@ -890,10 +880,9 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,                    NewPreHeader, VMap, DT, LI, PreserveLCSSA);    } -  // If this loop is nested, then the loop unroller changes the code in the -  // parent loop, so the Scalar Evolution pass needs to be run again. -  if (Loop *ParentLoop = L->getParentLoop()) -    SE->forgetLoop(ParentLoop); +  // If this loop is nested, then the loop unroller changes the code in the any +  // of its parent loops, so the Scalar Evolution pass needs to be run again. +  SE->forgetTopmostLoop(L);    // Canonicalize to LoopSimplifyForm both original and remainder loops. We    // cannot rely on the LoopUnrollPass to do this because it only does @@ -909,7 +898,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,    }    if (remainderLoop && UnrollRemainder) { -    DEBUG(dbgs() << "Unrolling remainder loop\n"); +    LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n");      UnrollLoop(remainderLoop, /*Count*/ Count - 1, /*TripCount*/ Count - 1,                 /*Force*/ false, /*AllowRuntime*/ false,                 /*AllowExpensiveTripCount*/ false, /*PreserveCondBr*/ true, diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp index fe106e33bca1..46af120a428b 100644 --- a/lib/Transforms/Utils/LoopUtils.cpp +++ b/lib/Transforms/Utils/LoopUtils.cpp @@ -16,13 +16,16 @@  #include "llvm/Analysis/AliasAnalysis.h"  #include "llvm/Analysis/BasicAliasAnalysis.h"  #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/InstructionSimplify.h"  #include "llvm/Analysis/LoopInfo.h"  #include "llvm/Analysis/LoopPass.h" +#include 
"llvm/Analysis/MustExecute.h"  #include "llvm/Analysis/ScalarEvolution.h"  #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"  #include "llvm/Analysis/ScalarEvolutionExpander.h"  #include "llvm/Analysis/ScalarEvolutionExpressions.h"  #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h"  #include "llvm/IR/Dominators.h"  #include "llvm/IR/Instructions.h"  #include "llvm/IR/Module.h" @@ -30,6 +33,7 @@  #include "llvm/IR/ValueHandle.h"  #include "llvm/Pass.h"  #include "llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h"  #include "llvm/Transforms/Utils/BasicBlockUtils.h"  using namespace llvm; @@ -77,10 +81,13 @@ bool RecurrenceDescriptor::isArithmeticRecurrenceKind(RecurrenceKind Kind) {    return false;  } -Instruction * -RecurrenceDescriptor::lookThroughAnd(PHINode *Phi, Type *&RT, -                                     SmallPtrSetImpl<Instruction *> &Visited, -                                     SmallPtrSetImpl<Instruction *> &CI) { +/// Determines if Phi may have been type-promoted. If Phi has a single user +/// that ANDs the Phi with a type mask, return the user. RT is updated to +/// account for the narrower bit width represented by the mask, and the AND +/// instruction is added to CI. +static Instruction *lookThroughAnd(PHINode *Phi, Type *&RT, +                                   SmallPtrSetImpl<Instruction *> &Visited, +                                   SmallPtrSetImpl<Instruction *> &CI) {    if (!Phi->hasOneUse())      return Phi; @@ -101,70 +108,92 @@ RecurrenceDescriptor::lookThroughAnd(PHINode *Phi, Type *&RT,    return Phi;  } -bool RecurrenceDescriptor::getSourceExtensionKind( -    Instruction *Start, Instruction *Exit, Type *RT, bool &IsSigned, -    SmallPtrSetImpl<Instruction *> &Visited, -    SmallPtrSetImpl<Instruction *> &CI) { +/// Compute the minimal bit width needed to represent a reduction whose exit +/// instruction is given by Exit. 
+static std::pair<Type *, bool> computeRecurrenceType(Instruction *Exit, +                                                     DemandedBits *DB, +                                                     AssumptionCache *AC, +                                                     DominatorTree *DT) { +  bool IsSigned = false; +  const DataLayout &DL = Exit->getModule()->getDataLayout(); +  uint64_t MaxBitWidth = DL.getTypeSizeInBits(Exit->getType()); + +  if (DB) { +    // Use the demanded bits analysis to determine the bits that are live out +    // of the exit instruction, rounding up to the nearest power of two. If the +    // use of demanded bits results in a smaller bit width, we know the value +    // must be positive (i.e., IsSigned = false), because if this were not the +    // case, the sign bit would have been demanded. +    auto Mask = DB->getDemandedBits(Exit); +    MaxBitWidth = Mask.getBitWidth() - Mask.countLeadingZeros(); +  } + +  if (MaxBitWidth == DL.getTypeSizeInBits(Exit->getType()) && AC && DT) { +    // If demanded bits wasn't able to limit the bit width, we can try to use +    // value tracking instead. This can be the case, for example, if the value +    // may be negative. +    auto NumSignBits = ComputeNumSignBits(Exit, DL, 0, AC, nullptr, DT); +    auto NumTypeBits = DL.getTypeSizeInBits(Exit->getType()); +    MaxBitWidth = NumTypeBits - NumSignBits; +    KnownBits Bits = computeKnownBits(Exit, DL); +    if (!Bits.isNonNegative()) { +      // If the value is not known to be non-negative, we set IsSigned to true, +      // meaning that we will use sext instructions instead of zext +      // instructions to restore the original type. +      IsSigned = true; +      if (!Bits.isNegative()) +        // If the value is not known to be negative, we don't known what the +        // upper bit is, and therefore, we don't know what kind of extend we +        // will need. In this case, just increase the bit width by one bit and +        // use sext. 
+        ++MaxBitWidth; +    } +  } +  if (!isPowerOf2_64(MaxBitWidth)) +    MaxBitWidth = NextPowerOf2(MaxBitWidth); + +  return std::make_pair(Type::getIntNTy(Exit->getContext(), MaxBitWidth), +                        IsSigned); +} + +/// Collect cast instructions that can be ignored in the vectorizer's cost +/// model, given a reduction exit value and the minimal type in which the +/// reduction can be represented. +static void collectCastsToIgnore(Loop *TheLoop, Instruction *Exit, +                                 Type *RecurrenceType, +                                 SmallPtrSetImpl<Instruction *> &Casts) {    SmallVector<Instruction *, 8> Worklist; -  bool FoundOneOperand = false; -  unsigned DstSize = RT->getPrimitiveSizeInBits(); +  SmallPtrSet<Instruction *, 8> Visited;    Worklist.push_back(Exit); -  // Traverse the instructions in the reduction expression, beginning with the -  // exit value.    while (!Worklist.empty()) { -    Instruction *I = Worklist.pop_back_val(); -    for (Use &U : I->operands()) { - -      // Terminate the traversal if the operand is not an instruction, or we -      // reach the starting value. -      Instruction *J = dyn_cast<Instruction>(U.get()); -      if (!J || J == Start) -        continue; - -      // Otherwise, investigate the operation if it is also in the expression. -      if (Visited.count(J)) { -        Worklist.push_back(J); +    Instruction *Val = Worklist.pop_back_val(); +    Visited.insert(Val); +    if (auto *Cast = dyn_cast<CastInst>(Val)) +      if (Cast->getSrcTy() == RecurrenceType) { +        // If the source type of a cast instruction is equal to the recurrence +        // type, it will be eliminated, and should be ignored in the vectorizer +        // cost model. +        Casts.insert(Cast);          continue;        } -      // If the operand is not in Visited, it is not a reduction operation, but -      // it does feed into one. 
Make sure it is either a single-use sign- or -      // zero-extend instruction. -      CastInst *Cast = dyn_cast<CastInst>(J); -      bool IsSExtInst = isa<SExtInst>(J); -      if (!Cast || !Cast->hasOneUse() || !(isa<ZExtInst>(J) || IsSExtInst)) -        return false; - -      // Ensure the source type of the extend is no larger than the reduction -      // type. It is not necessary for the types to be identical. -      unsigned SrcSize = Cast->getSrcTy()->getPrimitiveSizeInBits(); -      if (SrcSize > DstSize) -        return false; - -      // Furthermore, ensure that all such extends are of the same kind. -      if (FoundOneOperand) { -        if (IsSigned != IsSExtInst) -          return false; -      } else { -        FoundOneOperand = true; -        IsSigned = IsSExtInst; -      } - -      // Lastly, if the source type of the extend matches the reduction type, -      // add the extend to CI so that we can avoid accounting for it in the -      // cost model. -      if (SrcSize == DstSize) -        CI.insert(Cast); -    } +    // Add all operands to the work list if they are loop-varying values that +    // we haven't yet visited. 
+    for (Value *O : cast<User>(Val)->operands()) +      if (auto *I = dyn_cast<Instruction>(O)) +        if (TheLoop->contains(I) && !Visited.count(I)) +          Worklist.push_back(I);    } -  return true;  }  bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,                                             Loop *TheLoop, bool HasFunNoNaNAttr, -                                           RecurrenceDescriptor &RedDes) { +                                           RecurrenceDescriptor &RedDes, +                                           DemandedBits *DB, +                                           AssumptionCache *AC, +                                           DominatorTree *DT) {    if (Phi->getNumIncomingValues() != 2)      return false; @@ -353,14 +382,49 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,    if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)      return false; -  // If we think Phi may have been type-promoted, we also need to ensure that -  // all source operands of the reduction are either SExtInsts or ZEstInsts. If -  // so, we will be able to evaluate the reduction in the narrower bit width. -  if (Start != Phi) -    if (!getSourceExtensionKind(Start, ExitInstruction, RecurrenceType, -                                IsSigned, VisitedInsts, CastInsts)) +  if (Start != Phi) { +    // If the starting value is not the same as the phi node, we speculatively +    // looked through an 'and' instruction when evaluating a potential +    // arithmetic reduction to determine if it may have been type-promoted. +    // +    // We now compute the minimal bit width that is required to represent the +    // reduction. If this is the same width that was indicated by the 'and', we +    // can represent the reduction in the smaller type. The 'and' instruction +    // will be eliminated since it will essentially be a cast instruction that +    // can be ignore in the cost model. 
If we compute a different type than we +    // did when evaluating the 'and', the 'and' will not be eliminated, and we +    // will end up with different kinds of operations in the recurrence +    // expression (e.g., RK_IntegerAND, RK_IntegerADD). We give up if this is +    // the case. +    // +    // The vectorizer relies on InstCombine to perform the actual +    // type-shrinking. It does this by inserting instructions to truncate the +    // exit value of the reduction to the width indicated by RecurrenceType and +    // then extend this value back to the original width. If IsSigned is false, +    // a 'zext' instruction will be generated; otherwise, a 'sext' will be +    // used. +    // +    // TODO: We should not rely on InstCombine to rewrite the reduction in the +    //       smaller type. We should just generate a correctly typed expression +    //       to begin with. +    Type *ComputedType; +    std::tie(ComputedType, IsSigned) = +        computeRecurrenceType(ExitInstruction, DB, AC, DT); +    if (ComputedType != RecurrenceType)        return false; +    // The recurrence expression will be represented in a narrower type. If +    // there are any cast instructions that will be unnecessary, collect them +    // in CastInsts. Note that the 'and' instruction was already included in +    // this list. +    // +    // TODO: A better way to represent this may be to tag in some way all the +    //       instructions that are a part of the reduction. The vectorizer cost +    //       model could then apply the recurrence type to these instructions, +    //       without needing a white list of instructions to ignore. +    collectCastsToIgnore(TheLoop, ExitInstruction, RecurrenceType, CastInsts); +  } +    // We found a reduction var if we have reached the original phi node and we    // only have a single instruction with out-of-loop users. 
@@ -480,48 +544,59 @@ bool RecurrenceDescriptor::hasMultipleUsesOf(    return false;  }  bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, -                                          RecurrenceDescriptor &RedDes) { +                                          RecurrenceDescriptor &RedDes, +                                          DemandedBits *DB, AssumptionCache *AC, +                                          DominatorTree *DT) {    BasicBlock *Header = TheLoop->getHeader();    Function &F = *Header->getParent();    bool HasFunNoNaNAttr =        F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true"; -  if (AddReductionVar(Phi, RK_IntegerAdd, TheLoop, HasFunNoNaNAttr, RedDes)) { -    DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n"); +  if (AddReductionVar(Phi, RK_IntegerAdd, TheLoop, HasFunNoNaNAttr, RedDes, DB, +                      AC, DT)) { +    LLVM_DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n");      return true;    } -  if (AddReductionVar(Phi, RK_IntegerMult, TheLoop, HasFunNoNaNAttr, RedDes)) { -    DEBUG(dbgs() << "Found a MUL reduction PHI." << *Phi << "\n"); +  if (AddReductionVar(Phi, RK_IntegerMult, TheLoop, HasFunNoNaNAttr, RedDes, DB, +                      AC, DT)) { +    LLVM_DEBUG(dbgs() << "Found a MUL reduction PHI." << *Phi << "\n");      return true;    } -  if (AddReductionVar(Phi, RK_IntegerOr, TheLoop, HasFunNoNaNAttr, RedDes)) { -    DEBUG(dbgs() << "Found an OR reduction PHI." << *Phi << "\n"); +  if (AddReductionVar(Phi, RK_IntegerOr, TheLoop, HasFunNoNaNAttr, RedDes, DB, +                      AC, DT)) { +    LLVM_DEBUG(dbgs() << "Found an OR reduction PHI." << *Phi << "\n");      return true;    } -  if (AddReductionVar(Phi, RK_IntegerAnd, TheLoop, HasFunNoNaNAttr, RedDes)) { -    DEBUG(dbgs() << "Found an AND reduction PHI." 
<< *Phi << "\n"); +  if (AddReductionVar(Phi, RK_IntegerAnd, TheLoop, HasFunNoNaNAttr, RedDes, DB, +                      AC, DT)) { +    LLVM_DEBUG(dbgs() << "Found an AND reduction PHI." << *Phi << "\n");      return true;    } -  if (AddReductionVar(Phi, RK_IntegerXor, TheLoop, HasFunNoNaNAttr, RedDes)) { -    DEBUG(dbgs() << "Found a XOR reduction PHI." << *Phi << "\n"); +  if (AddReductionVar(Phi, RK_IntegerXor, TheLoop, HasFunNoNaNAttr, RedDes, DB, +                      AC, DT)) { +    LLVM_DEBUG(dbgs() << "Found a XOR reduction PHI." << *Phi << "\n");      return true;    } -  if (AddReductionVar(Phi, RK_IntegerMinMax, TheLoop, HasFunNoNaNAttr, -                      RedDes)) { -    DEBUG(dbgs() << "Found a MINMAX reduction PHI." << *Phi << "\n"); +  if (AddReductionVar(Phi, RK_IntegerMinMax, TheLoop, HasFunNoNaNAttr, RedDes, +                      DB, AC, DT)) { +    LLVM_DEBUG(dbgs() << "Found a MINMAX reduction PHI." << *Phi << "\n");      return true;    } -  if (AddReductionVar(Phi, RK_FloatMult, TheLoop, HasFunNoNaNAttr, RedDes)) { -    DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n"); +  if (AddReductionVar(Phi, RK_FloatMult, TheLoop, HasFunNoNaNAttr, RedDes, DB, +                      AC, DT)) { +    LLVM_DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n");      return true;    } -  if (AddReductionVar(Phi, RK_FloatAdd, TheLoop, HasFunNoNaNAttr, RedDes)) { -    DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n"); +  if (AddReductionVar(Phi, RK_FloatAdd, TheLoop, HasFunNoNaNAttr, RedDes, DB, +                      AC, DT)) { +    LLVM_DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n");      return true;    } -  if (AddReductionVar(Phi, RK_FloatMinMax, TheLoop, HasFunNoNaNAttr, RedDes)) { -    DEBUG(dbgs() << "Found an float MINMAX reduction PHI." 
<< *Phi << "\n"); +  if (AddReductionVar(Phi, RK_FloatMinMax, TheLoop, HasFunNoNaNAttr, RedDes, DB, +                      AC, DT)) { +    LLVM_DEBUG(dbgs() << "Found an float MINMAX reduction PHI." << *Phi +                      << "\n");      return true;    }    // Not a reduction of known type. @@ -849,13 +924,13 @@ bool InductionDescriptor::isFPInductionPHI(PHINode *Phi, const Loop *TheLoop,  }  /// This function is called when we suspect that the update-chain of a phi node -/// (whose symbolic SCEV expression sin \p PhiScev) contains redundant casts,  -/// that can be ignored. (This can happen when the PSCEV rewriter adds a runtime  -/// predicate P under which the SCEV expression for the phi can be the  -/// AddRecurrence \p AR; See createAddRecFromPHIWithCast). We want to find the  -/// cast instructions that are involved in the update-chain of this induction.  -/// A caller that adds the required runtime predicate can be free to drop these  -/// cast instructions, and compute the phi using \p AR (instead of some scev  +/// (whose symbolic SCEV expression sin \p PhiScev) contains redundant casts, +/// that can be ignored. (This can happen when the PSCEV rewriter adds a runtime +/// predicate P under which the SCEV expression for the phi can be the +/// AddRecurrence \p AR; See createAddRecFromPHIWithCast). We want to find the +/// cast instructions that are involved in the update-chain of this induction. +/// A caller that adds the required runtime predicate can be free to drop these +/// cast instructions, and compute the phi using \p AR (instead of some scev  /// expression with casts).  
///  /// For example, without a predicate the scev expression can take the following @@ -890,7 +965,7 @@ static bool getCastsForInductionPHI(PredicatedScalarEvolution &PSE,    assert(PSE.getSCEV(PN) == AR && "Unexpected phi node SCEV expression");    const Loop *L = AR->getLoop(); -  // Find any cast instructions that participate in the def-use chain of  +  // Find any cast instructions that participate in the def-use chain of    // PhiScev in the loop.    // FORNOW/TODO: We currently expect the def-use chain to include only    // two-operand instructions, where one of the operands is an invariant. @@ -978,7 +1053,7 @@ bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop,      AR = PSE.getAsAddRec(Phi);    if (!AR) { -    DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); +    LLVM_DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");      return false;    } @@ -1012,14 +1087,15 @@ bool InductionDescriptor::isInductionPHI(    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);    if (!AR) { -    DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); +    LLVM_DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");      return false;    }    if (AR->getLoop() != TheLoop) {      // FIXME: We should treat this as a uniform. Unfortunately, we      // don't currently know how to handled uniform PHIs. 
-    DEBUG(dbgs() << "LV: PHI is a recurrence with respect to an outer loop.\n"); +    LLVM_DEBUG( +        dbgs() << "LV: PHI is a recurrence with respect to an outer loop.\n");      return false;    } @@ -1100,11 +1176,12 @@ bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI,          BB, InLoopPredecessors, ".loopexit", DT, LI, PreserveLCSSA);      if (!NewExitBB) -      DEBUG(dbgs() << "WARNING: Can't create a dedicated exit block for loop: " -                   << *L << "\n"); +      LLVM_DEBUG( +          dbgs() << "WARNING: Can't create a dedicated exit block for loop: " +                 << *L << "\n");      else -      DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block " -                   << NewExitBB->getName() << "\n"); +      LLVM_DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block " +                        << NewExitBB->getName() << "\n");      return true;    }; @@ -1127,7 +1204,7 @@ bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI,    return Changed;  } -/// \brief Returns the instructions that use values defined in the loop. +/// Returns the instructions that use values defined in the loop.  SmallVector<Instruction *, 8> llvm::findDefsUsedOutsideOfLoop(Loop *L) {    SmallVector<Instruction *, 8> UsedOutside; @@ -1204,7 +1281,7 @@ void llvm::initializeLoopPassPass(PassRegistry &Registry) {    INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)  } -/// \brief Find string metadata for loop +/// Find string metadata for loop  ///  /// If it has a value (e.g. {"llvm.distribute", 1} return the value as an  /// operand or null otherwise.  If the string metadata is not found return @@ -1321,13 +1398,12 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr,    // Rewrite phis in the exit block to get their inputs from the Preheader    // instead of the exiting block. 
-  BasicBlock::iterator BI = ExitBlock->begin(); -  while (PHINode *P = dyn_cast<PHINode>(BI)) { +  for (PHINode &P : ExitBlock->phis()) {      // Set the zero'th element of Phi to be from the preheader and remove all      // other incoming values. Given the loop has dedicated exits, all other      // incoming values must be from the exiting blocks.      int PredIndex = 0; -    P->setIncomingBlock(PredIndex, Preheader); +    P.setIncomingBlock(PredIndex, Preheader);      // Removes all incoming values from all other exiting blocks (including      // duplicate values from an exiting block).      // Nuke all entries except the zero'th entry which is the preheader entry. @@ -1335,13 +1411,12 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr,      // below, to keep the indices valid for deletion (removeIncomingValues      // updates getNumIncomingValues and shifts all values down into the operand      // being deleted). -    for (unsigned i = 0, e = P->getNumIncomingValues() - 1; i != e; ++i) -      P->removeIncomingValue(e - i, false); +    for (unsigned i = 0, e = P.getNumIncomingValues() - 1; i != e; ++i) +      P.removeIncomingValue(e - i, false); -    assert((P->getNumIncomingValues() == 1 && -            P->getIncomingBlock(PredIndex) == Preheader) && +    assert((P.getNumIncomingValues() == 1 && +            P.getIncomingBlock(PredIndex) == Preheader) &&             "Should have exactly one value and that's from the preheader!"); -    ++BI;    }    // Disconnect the loop body by branching directly to its exit. @@ -1358,6 +1433,32 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr,      DT->deleteEdge(Preheader, L->getHeader());    } +  // Given LCSSA form is satisfied, we should not have users of instructions +  // within the dead loop outside of the loop. However, LCSSA doesn't take +  // unreachable uses into account. We handle them here. 
+  // We could do it after drop all references (in this case all users in the +  // loop will be already eliminated and we have less work to do but according +  // to API doc of User::dropAllReferences only valid operation after dropping +  // references, is deletion. So let's substitute all usages of +  // instruction from the loop with undef value of corresponding type first. +  for (auto *Block : L->blocks()) +    for (Instruction &I : *Block) { +      auto *Undef = UndefValue::get(I.getType()); +      for (Value::use_iterator UI = I.use_begin(), E = I.use_end(); UI != E;) { +        Use &U = *UI; +        ++UI; +        if (auto *Usr = dyn_cast<Instruction>(U.getUser())) +          if (L->contains(Usr->getParent())) +            continue; +        // If we have a DT then we can check that uses outside a loop only in +        // unreachable block. +        if (DT) +          assert(!DT->isReachableFromEntry(U) && +                 "Unexpected user in reachable block"); +        U.set(Undef); +      } +    } +    // Remove the block from the reference counting scheme, so that we can    // delete it freely later.    for (auto *Block : L->blocks()) @@ -1385,54 +1486,12 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr,    }  } -/// Returns true if the instruction in a loop is guaranteed to execute at least -/// once. -bool llvm::isGuaranteedToExecute(const Instruction &Inst, -                                 const DominatorTree *DT, const Loop *CurLoop, -                                 const LoopSafetyInfo *SafetyInfo) { -  // We have to check to make sure that the instruction dominates all -  // of the exit blocks.  If it doesn't, then there is a path out of the loop -  // which does not execute this instruction, so we can't hoist it. - -  // If the instruction is in the header block for the loop (which is very -  // common), it is always guaranteed to dominate the exit blocks.  
Since this -  // is a common case, and can save some work, check it now. -  if (Inst.getParent() == CurLoop->getHeader()) -    // If there's a throw in the header block, we can't guarantee we'll reach -    // Inst. -    return !SafetyInfo->HeaderMayThrow; - -  // Somewhere in this loop there is an instruction which may throw and make us -  // exit the loop. -  if (SafetyInfo->MayThrow) -    return false; - -  // Get the exit blocks for the current loop. -  SmallVector<BasicBlock *, 8> ExitBlocks; -  CurLoop->getExitBlocks(ExitBlocks); - -  // Verify that the block dominates each of the exit blocks of the loop. -  for (BasicBlock *ExitBlock : ExitBlocks) -    if (!DT->dominates(Inst.getParent(), ExitBlock)) -      return false; - -  // As a degenerate case, if the loop is statically infinite then we haven't -  // proven anything since there are no exit blocks. -  if (ExitBlocks.empty()) -    return false; - -  // FIXME: In general, we have to prove that the loop isn't an infinite loop. -  // See http::llvm.org/PR24078 .  (The "ExitBlocks.empty()" check above is -  // just a special case of this.) -  return true; -} -  Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {    // Only support loops with a unique exiting block, and a latch.    if (!L->getExitingBlock())      return None; -  // Get the branch weights for the the loop's backedge. +  // Get the branch weights for the loop's backedge.    BranchInst *LatchBR =        dyn_cast<BranchInst>(L->getLoopLatch()->getTerminator());    if (!LatchBR || LatchBR->getNumSuccessors() != 2) @@ -1460,7 +1519,7 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {      return (FalseVal + (TrueVal / 2)) / TrueVal;  } -/// \brief Adds a 'fast' flag to floating point operations. +/// Adds a 'fast' flag to floating point operations.  
static Value *addFastMathFlag(Value *V) {    if (isa<FPMathOperator>(V)) {      FastMathFlags Flags; @@ -1470,6 +1529,38 @@ static Value *addFastMathFlag(Value *V) {    return V;  } +// Helper to generate an ordered reduction. +Value * +llvm::getOrderedReduction(IRBuilder<> &Builder, Value *Acc, Value *Src, +                          unsigned Op, +                          RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind, +                          ArrayRef<Value *> RedOps) { +  unsigned VF = Src->getType()->getVectorNumElements(); + +  // Extract and apply reduction ops in ascending order: +  // e.g. ((((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ) ... + Scl[VF-1] +  Value *Result = Acc; +  for (unsigned ExtractIdx = 0; ExtractIdx != VF; ++ExtractIdx) { +    Value *Ext = +        Builder.CreateExtractElement(Src, Builder.getInt32(ExtractIdx)); + +    if (Op != Instruction::ICmp && Op != Instruction::FCmp) { +      Result = Builder.CreateBinOp((Instruction::BinaryOps)Op, Result, Ext, +                                   "bin.rdx"); +    } else { +      assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid && +             "Invalid min/max"); +      Result = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, Result, +                                                    Ext); +    } + +    if (!RedOps.empty()) +      propagateIRFlags(Result, RedOps); +  } + +  return Result; +} +  // Helper to generate a log2 shuffle reduction.  
Value *  llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op, diff --git a/lib/Transforms/Utils/LoopVersioning.cpp b/lib/Transforms/Utils/LoopVersioning.cpp index 29756d9dab7f..abbcd5f9e3b8 100644 --- a/lib/Transforms/Utils/LoopVersioning.cpp +++ b/lib/Transforms/Utils/LoopVersioning.cpp @@ -140,9 +140,12 @@ void LoopVersioning::addPHINodes(      if (!PN) {        PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".lver",                             &PHIBlock->front()); -      for (auto *User : Inst->users()) -        if (!VersionedLoop->contains(cast<Instruction>(User)->getParent())) -          User->replaceUsesOfWith(Inst, PN); +      SmallVector<User*, 8> UsersToUpdate; +      for (User *U : Inst->users()) +        if (!VersionedLoop->contains(cast<Instruction>(U)->getParent())) +          UsersToUpdate.push_back(U); +      for (User *U : UsersToUpdate) +        U->replaceUsesOfWith(Inst, PN);        PN->addIncoming(Inst, VersionedLoop->getExitingBlock());      }    } @@ -248,7 +251,7 @@ void LoopVersioning::annotateInstWithNoAlias(Instruction *VersionedInst,  }  namespace { -/// \brief Also expose this is a pass.  Currently this is only used for +/// Also expose this is a pass.  Currently this is only used for  /// unit-testing.  It adds all memchecks necessary to remove all may-aliasing  /// array accesses from the loop.  
class LoopVersioningPass : public FunctionPass { diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp index ee84541e526d..c852d538b0d1 100644 --- a/lib/Transforms/Utils/LowerInvoke.cpp +++ b/lib/Transforms/Utils/LowerInvoke.cpp @@ -21,7 +21,7 @@  #include "llvm/IR/LLVMContext.h"  #include "llvm/IR/Module.h"  #include "llvm/Pass.h" -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h"  using namespace llvm;  #define DEBUG_TYPE "lowerinvoke" @@ -48,10 +48,12 @@ static bool runImpl(Function &F) {    bool Changed = false;    for (BasicBlock &BB : F)      if (InvokeInst *II = dyn_cast<InvokeInst>(BB.getTerminator())) { -      SmallVector<Value *, 16> CallArgs(II->op_begin(), II->op_end() - 3); +      SmallVector<Value *, 16> CallArgs(II->arg_begin(), II->arg_end()); +      SmallVector<OperandBundleDef, 1> OpBundles; +      II->getOperandBundlesAsDefs(OpBundles);        // Insert a normal call instruction...        CallInst *NewCall = -          CallInst::Create(II->getCalledValue(), CallArgs, "", II); +          CallInst::Create(II->getCalledValue(), CallArgs, OpBundles, "", II);        NewCall->takeName(II);        NewCall->setCallingConv(II->getCallingConv());        NewCall->setAttributes(II->getAttributes()); diff --git a/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/lib/Transforms/Utils/LowerMemIntrinsics.cpp index 57dc225e9dab..03006ef3a2d3 100644 --- a/lib/Transforms/Utils/LowerMemIntrinsics.cpp +++ b/lib/Transforms/Utils/LowerMemIntrinsics.cpp @@ -409,8 +409,8 @@ void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,                                /* SrcAddr */ Memcpy->getRawSource(),                                /* DstAddr */ Memcpy->getRawDest(),                                /* CopyLen */ CI, -                              /* SrcAlign */ Memcpy->getAlignment(), -                              /* DestAlign */ Memcpy->getAlignment(), +                              /* SrcAlign */ Memcpy->getSourceAlignment(), + 
                             /* DestAlign */ Memcpy->getDestAlignment(),                                /* SrcIsVolatile */ Memcpy->isVolatile(),                                /* DstIsVolatile */ Memcpy->isVolatile(),                                /* TargetTransformInfo */ TTI); @@ -419,8 +419,8 @@ void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,                                  /* SrcAddr */ Memcpy->getRawSource(),                                  /* DstAddr */ Memcpy->getRawDest(),                                  /* CopyLen */ Memcpy->getLength(), -                                /* SrcAlign */ Memcpy->getAlignment(), -                                /* DestAlign */ Memcpy->getAlignment(), +                                /* SrcAlign */ Memcpy->getSourceAlignment(), +                                /* DestAlign */ Memcpy->getDestAlignment(),                                  /* SrcIsVolatile */ Memcpy->isVolatile(),                                  /* DstIsVolatile */ Memcpy->isVolatile(),                                  /* TargetTransfomrInfo */ TTI); @@ -432,8 +432,8 @@ void llvm::expandMemMoveAsLoop(MemMoveInst *Memmove) {                      /* SrcAddr */ Memmove->getRawSource(),                      /* DstAddr */ Memmove->getRawDest(),                      /* CopyLen */ Memmove->getLength(), -                    /* SrcAlign */ Memmove->getAlignment(), -                    /* DestAlign */ Memmove->getAlignment(), +                    /* SrcAlign */ Memmove->getSourceAlignment(), +                    /* DestAlign */ Memmove->getDestAlignment(),                      /* SrcIsVolatile */ Memmove->isVolatile(),                      /* DstIsVolatile */ Memmove->isVolatile());  } @@ -443,6 +443,6 @@ void llvm::expandMemSetAsLoop(MemSetInst *Memset) {                     /* DstAddr */ Memset->getRawDest(),                     /* CopyLen */ Memset->getLength(),                     /* SetValue */ Memset->getValue(), -                   /* Alignment */ 
Memset->getAlignment(), +                   /* Alignment */ Memset->getDestAlignment(),                     Memset->isVolatile());  } diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp index 344cb35df986..e99ecfef19cd 100644 --- a/lib/Transforms/Utils/LowerSwitch.cpp +++ b/lib/Transforms/Utils/LowerSwitch.cpp @@ -29,7 +29,7 @@  #include "llvm/Support/Compiler.h"  #include "llvm/Support/Debug.h"  #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h"  #include "llvm/Transforms/Utils/BasicBlockUtils.h"  #include <algorithm>  #include <cassert> @@ -74,7 +74,7 @@ namespace {      LowerSwitch() : FunctionPass(ID) {        initializeLowerSwitchPass(*PassRegistry::getPassRegistry()); -    }  +    }      bool runOnFunction(Function &F) override; @@ -155,11 +155,8 @@ bool LowerSwitch::runOnFunction(Function &F) {  }  /// Used for debugging purposes. -static raw_ostream& operator<<(raw_ostream &O, -                               const LowerSwitch::CaseVector &C) -    LLVM_ATTRIBUTE_USED; - -static raw_ostream& operator<<(raw_ostream &O, +LLVM_ATTRIBUTE_USED +static raw_ostream &operator<<(raw_ostream &O,                                 const LowerSwitch::CaseVector &C) {    O << "["; @@ -172,7 +169,7 @@ static raw_ostream& operator<<(raw_ostream &O,    return O << "]";  } -/// \brief Update the first occurrence of the "switch statement" BB in the PHI +/// Update the first occurrence of the "switch statement" BB in the PHI  /// node with the "new" BB. The other occurrences will:  ///  /// 1) Be updated by subsequent calls to this function.  
Switch statements may @@ -245,14 +242,13 @@ LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound,    unsigned Mid = Size / 2;    std::vector<CaseRange> LHS(Begin, Begin + Mid); -  DEBUG(dbgs() << "LHS: " << LHS << "\n"); +  LLVM_DEBUG(dbgs() << "LHS: " << LHS << "\n");    std::vector<CaseRange> RHS(Begin + Mid, End); -  DEBUG(dbgs() << "RHS: " << RHS << "\n"); +  LLVM_DEBUG(dbgs() << "RHS: " << RHS << "\n");    CaseRange &Pivot = *(Begin + Mid); -  DEBUG(dbgs() << "Pivot ==> " -               << Pivot.Low->getValue() -               << " -" << Pivot.High->getValue() << "\n"); +  LLVM_DEBUG(dbgs() << "Pivot ==> " << Pivot.Low->getValue() << " -" +                    << Pivot.High->getValue() << "\n");    // NewLowerBound here should never be the integer minimal value.    // This is because it is computed from a case range that is never @@ -274,20 +270,14 @@ LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound,        NewUpperBound = LHS.back().High;    } -  DEBUG(dbgs() << "LHS Bounds ==> "; -        if (LowerBound) { -          dbgs() << LowerBound->getSExtValue(); -        } else { -          dbgs() << "NONE"; -        } -        dbgs() << " - " << NewUpperBound->getSExtValue() << "\n"; -        dbgs() << "RHS Bounds ==> "; -        dbgs() << NewLowerBound->getSExtValue() << " - "; -        if (UpperBound) { -          dbgs() << UpperBound->getSExtValue() << "\n"; -        } else { -          dbgs() << "NONE\n"; -        }); +  LLVM_DEBUG(dbgs() << "LHS Bounds ==> "; if (LowerBound) { +    dbgs() << LowerBound->getSExtValue(); +  } else { dbgs() << "NONE"; } dbgs() << " - " +                                      << NewUpperBound->getSExtValue() << "\n"; +             dbgs() << "RHS Bounds ==> "; +             dbgs() << NewLowerBound->getSExtValue() << " - "; if (UpperBound) { +               dbgs() << UpperBound->getSExtValue() << "\n"; +             } else { dbgs() << "NONE\n"; });    // Create a new node that 
checks if the value is < pivot. Go to the    // left branch if it is and right branch if not. @@ -337,7 +327,7 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val,      } else if (Leaf.Low->isZero()) {        // Val >= 0 && Val <= Hi --> Val <=u Hi        Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_ULE, Val, Leaf.High, -                          "SwitchLeaf");       +                          "SwitchLeaf");      } else {        // Emit V-Lo <=u Hi-Lo        Constant* NegLo = ConstantExpr::getNeg(Leaf.Low); @@ -364,7 +354,7 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val,      for (uint64_t j = 0; j < Range; ++j) {        PN->removeIncomingValue(OrigBlock);      } -     +      int BlockIdx = PN->getBasicBlockIndex(OrigBlock);      assert(BlockIdx != -1 && "Switch didn't go to this successor??");      PN->setIncomingBlock((unsigned)BlockIdx, NewLeaf); @@ -382,7 +372,7 @@ unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) {      Cases.push_back(CaseRange(Case.getCaseValue(), Case.getCaseValue(),                                Case.getCaseSuccessor())); -  std::sort(Cases.begin(), Cases.end(), CaseCmp()); +  llvm::sort(Cases.begin(), Cases.end(), CaseCmp());    // Merge case into clusters    if (Cases.size() >= 2) { @@ -443,9 +433,9 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI,    // Prepare cases vector.    CaseVector Cases;    unsigned numCmps = Clusterify(Cases, SI); -  DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size() -               << ". Total compares: " << numCmps << "\n"); -  DEBUG(dbgs() << "Cases: " << Cases << "\n"); +  LLVM_DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size() +                    << ". 
Total compares: " << numCmps << "\n"); +  LLVM_DEBUG(dbgs() << "Cases: " << Cases << "\n");    (void)numCmps;    ConstantInt *LowerBound = nullptr; @@ -505,6 +495,10 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI,      }  #endif +    // As the default block in the switch is unreachable, update the PHI nodes +    // (remove the entry to the default block) to reflect this. +    Default->removePredecessor(OrigBlock); +      // Use the most popular block as the new default, reducing the number of      // cases.      assert(MaxPop > 0 && PopSucc); @@ -518,29 +512,33 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI,      if (Cases.empty()) {        BranchInst::Create(Default, CurBlock);        SI->eraseFromParent(); +      // As all the cases have been replaced with a single branch, only keep +      // one entry in the PHI nodes. +      for (unsigned I = 0 ; I < (MaxPop - 1) ; ++I) +        PopSucc->removePredecessor(OrigBlock);        return;      }    } +  unsigned NrOfDefaults = (SI->getDefaultDest() == Default) ? 1 : 0; +  for (const auto &Case : SI->cases()) +    if (Case.getCaseSuccessor() == Default) +      NrOfDefaults++; +    // Create a new, empty default block so that the new hierarchy of    // if-then statements go to this and the PHI nodes are happy.    BasicBlock *NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault");    F->getBasicBlockList().insert(Default->getIterator(), NewDefault);    BranchInst::Create(Default, NewDefault); -  // If there is an entry in any PHI nodes for the default edge, make sure -  // to update them as well. 
-  for (BasicBlock::iterator I = Default->begin(); isa<PHINode>(I); ++I) { -    PHINode *PN = cast<PHINode>(I); -    int BlockIdx = PN->getBasicBlockIndex(OrigBlock); -    assert(BlockIdx != -1 && "Switch didn't go to this successor??"); -    PN->setIncomingBlock((unsigned)BlockIdx, NewDefault); -  } -    BasicBlock *SwitchBlock =        switchConvert(Cases.begin(), Cases.end(), LowerBound, UpperBound, Val,                      OrigBlock, OrigBlock, NewDefault, UnreachableRanges); +  // If there are entries in any PHI nodes for the default edge, make sure +  // to update them as well. +  fixPhis(Default, OrigBlock, NewDefault, NrOfDefaults); +    // Branch to our shiny new if-then stuff...    BranchInst::Create(SwitchBlock, OrigBlock); diff --git a/lib/Transforms/Utils/Mem2Reg.cpp b/lib/Transforms/Utils/Mem2Reg.cpp index 29f289b62da0..23145e584751 100644 --- a/lib/Transforms/Utils/Mem2Reg.cpp +++ b/lib/Transforms/Utils/Mem2Reg.cpp @@ -22,7 +22,7 @@  #include "llvm/IR/PassManager.h"  #include "llvm/Pass.h"  #include "llvm/Support/Casting.h" -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h"  #include "llvm/Transforms/Utils/PromoteMemToReg.h"  #include <vector> diff --git a/lib/Transforms/Utils/MetaRenamer.cpp b/lib/Transforms/Utils/MetaRenamer.cpp index 0f7bd76c03ca..323f2552ca80 100644 --- a/lib/Transforms/Utils/MetaRenamer.cpp +++ b/lib/Transforms/Utils/MetaRenamer.cpp @@ -29,7 +29,7 @@  #include "llvm/IR/Type.h"  #include "llvm/IR/TypeFinder.h"  #include "llvm/Pass.h" -#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Utils.h"  using namespace llvm; diff --git a/lib/Transforms/Utils/OrderedInstructions.cpp b/lib/Transforms/Utils/OrderedInstructions.cpp index dc780542ce68..6d0b96f6aa8a 100644 --- a/lib/Transforms/Utils/OrderedInstructions.cpp +++ b/lib/Transforms/Utils/OrderedInstructions.cpp @@ -14,19 +14,38 @@  #include "llvm/Transforms/Utils/OrderedInstructions.h"  using namespace llvm; +bool 
OrderedInstructions::localDominates(const Instruction *InstA, +                                         const Instruction *InstB) const { +  assert(InstA->getParent() == InstB->getParent() && +         "Instructions must be in the same basic block"); + +  const BasicBlock *IBB = InstA->getParent(); +  auto OBB = OBBMap.find(IBB); +  if (OBB == OBBMap.end()) +    OBB = OBBMap.insert({IBB, make_unique<OrderedBasicBlock>(IBB)}).first; +  return OBB->second->dominates(InstA, InstB); +} +  /// Given 2 instructions, use OrderedBasicBlock to check for dominance relation  /// if the instructions are in the same basic block, Otherwise, use dominator  /// tree.  bool OrderedInstructions::dominates(const Instruction *InstA,                                      const Instruction *InstB) const { -  const BasicBlock *IBB = InstA->getParent();    // Use ordered basic block to do dominance check in case the 2 instructions    // are in the same basic block. -  if (IBB == InstB->getParent()) { -    auto OBB = OBBMap.find(IBB); -    if (OBB == OBBMap.end()) -      OBB = OBBMap.insert({IBB, make_unique<OrderedBasicBlock>(IBB)}).first; -    return OBB->second->dominates(InstA, InstB); -  } +  if (InstA->getParent() == InstB->getParent()) +    return localDominates(InstA, InstB);    return DT->dominates(InstA->getParent(), InstB->getParent());  } + +bool OrderedInstructions::dfsBefore(const Instruction *InstA, +                                    const Instruction *InstB) const { +  // Use ordered basic block in case the 2 instructions are in the same basic +  // block. 
+  if (InstA->getParent() == InstB->getParent()) +    return localDominates(InstA, InstB); + +  DomTreeNode *DA = DT->getNode(InstA->getParent()); +  DomTreeNode *DB = DT->getNode(InstB->getParent()); +  return DA->getDFSNumIn() < DB->getDFSNumIn(); +} diff --git a/lib/Transforms/Utils/PredicateInfo.cpp b/lib/Transforms/Utils/PredicateInfo.cpp index d47be6ea566b..2923977b791a 100644 --- a/lib/Transforms/Utils/PredicateInfo.cpp +++ b/lib/Transforms/Utils/PredicateInfo.cpp @@ -17,6 +17,7 @@  #include "llvm/ADT/STLExtras.h"  #include "llvm/ADT/SmallPtrSet.h"  #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h"  #include "llvm/Analysis/AssumptionCache.h"  #include "llvm/Analysis/CFG.h"  #include "llvm/IR/AssemblyAnnotationWriter.h" @@ -24,6 +25,7 @@  #include "llvm/IR/Dominators.h"  #include "llvm/IR/GlobalVariable.h"  #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h"  #include "llvm/IR/IntrinsicInst.h"  #include "llvm/IR/LLVMContext.h"  #include "llvm/IR/Metadata.h" @@ -32,7 +34,7 @@  #include "llvm/Support/Debug.h"  #include "llvm/Support/DebugCounter.h"  #include "llvm/Support/FormattedStream.h" -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h"  #include "llvm/Transforms/Utils/OrderedInstructions.h"  #include <algorithm>  #define DEBUG_TYPE "predicateinfo" @@ -118,7 +120,7 @@ static bool valueComesBefore(OrderedInstructions &OI, const Value *A,      return false;    if (ArgA && ArgB)      return ArgA->getArgNo() < ArgB->getArgNo(); -  return OI.dominates(cast<Instruction>(A), cast<Instruction>(B)); +  return OI.dfsBefore(cast<Instruction>(A), cast<Instruction>(B));  }  // This compares ValueDFS structures, creating OrderedBasicBlocks where @@ -479,6 +481,19 @@ void PredicateInfo::buildPredicateInfo() {    renameUses(OpsToRename);  } +// Create a ssa_copy declaration with custom mangling, because +// Intrinsic::getDeclaration does not handle overloaded unnamed types properly: +// all unnamed types get mangled to 
the same string. We use the pointer +// to the type as name here, as it guarantees unique names for different +// types and we remove the declarations when destroying PredicateInfo. +// It is a workaround for PR38117, because solving it in a fully general way is +// tricky (FIXME). +static Function *getCopyDeclaration(Module *M, Type *Ty) { +  std::string Name = "llvm.ssa.copy." + utostr((uintptr_t) Ty); +  return cast<Function>(M->getOrInsertFunction( +      Name, getType(M->getContext(), Intrinsic::ssa_copy, Ty))); +} +  // Given the renaming stack, make all the operands currently on the stack real  // by inserting them into the IR.  Return the last operation's value.  Value *PredicateInfo::materializeStack(unsigned int &Counter, @@ -507,8 +522,9 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter,      // order in the case of multiple predicateinfo in the same block.      if (isa<PredicateWithEdge>(ValInfo)) {        IRBuilder<> B(getBranchTerminator(ValInfo)); -      Function *IF = Intrinsic::getDeclaration( -          F.getParent(), Intrinsic::ssa_copy, Op->getType()); +      Function *IF = getCopyDeclaration(F.getParent(), Op->getType()); +      if (IF->user_begin() == IF->user_end()) +        CreatedDeclarations.insert(IF);        CallInst *PIC =            B.CreateCall(IF, Op, Op->getName() + "." 
+ Twine(Counter++));        PredicateMap.insert({PIC, ValInfo}); @@ -518,8 +534,9 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter,        assert(PAssume &&               "Should not have gotten here without it being an assume");        IRBuilder<> B(PAssume->AssumeInst); -      Function *IF = Intrinsic::getDeclaration( -          F.getParent(), Intrinsic::ssa_copy, Op->getType()); +      Function *IF = getCopyDeclaration(F.getParent(), Op->getType()); +      if (IF->user_begin() == IF->user_end()) +        CreatedDeclarations.insert(IF);        CallInst *PIC = B.CreateCall(IF, Op);        PredicateMap.insert({PIC, ValInfo});        Result.Def = PIC; @@ -553,10 +570,11 @@ void PredicateInfo::renameUses(SmallPtrSetImpl<Value *> &OpSet) {    auto Comparator = [&](const Value *A, const Value *B) {      return valueComesBefore(OI, A, B);    }; -  std::sort(OpsToRename.begin(), OpsToRename.end(), Comparator); +  llvm::sort(OpsToRename.begin(), OpsToRename.end(), Comparator);    ValueDFS_Compare Compare(OI);    // Compute liveness, and rename in O(uses) per Op.    for (auto *Op : OpsToRename) { +    LLVM_DEBUG(dbgs() << "Visiting " << *Op << "\n");      unsigned Counter = 0;      SmallVector<ValueDFS, 16> OrderedUses;      const auto &ValueInfo = getValueInfo(Op); @@ -625,15 +643,15 @@ void PredicateInfo::renameUses(SmallPtrSetImpl<Value *> &OpSet) {        // we want to.        
bool PossibleCopy = VD.PInfo != nullptr;        if (RenameStack.empty()) { -        DEBUG(dbgs() << "Rename Stack is empty\n"); +        LLVM_DEBUG(dbgs() << "Rename Stack is empty\n");        } else { -        DEBUG(dbgs() << "Rename Stack Top DFS numbers are (" -                     << RenameStack.back().DFSIn << "," -                     << RenameStack.back().DFSOut << ")\n"); +        LLVM_DEBUG(dbgs() << "Rename Stack Top DFS numbers are (" +                          << RenameStack.back().DFSIn << "," +                          << RenameStack.back().DFSOut << ")\n");        } -      DEBUG(dbgs() << "Current DFS numbers are (" << VD.DFSIn << "," -                   << VD.DFSOut << ")\n"); +      LLVM_DEBUG(dbgs() << "Current DFS numbers are (" << VD.DFSIn << "," +                        << VD.DFSOut << ")\n");        bool ShouldPush = (VD.Def || PossibleCopy);        bool OutOfScope = !stackIsInScope(RenameStack, VD); @@ -652,7 +670,7 @@ void PredicateInfo::renameUses(SmallPtrSetImpl<Value *> &OpSet) {        if (VD.Def || PossibleCopy)          continue;        if (!DebugCounter::shouldExecute(RenameCounter)) { -        DEBUG(dbgs() << "Skipping execution due to debug counter\n"); +        LLVM_DEBUG(dbgs() << "Skipping execution due to debug counter\n");          continue;        }        ValueDFS &Result = RenameStack.back(); @@ -663,8 +681,9 @@ void PredicateInfo::renameUses(SmallPtrSetImpl<Value *> &OpSet) {        if (!Result.Def)          Result.Def = materializeStack(Counter, RenameStack, Op); -      DEBUG(dbgs() << "Found replacement " << *Result.Def << " for " -                   << *VD.U->get() << " in " << *(VD.U->getUser()) << "\n"); +      LLVM_DEBUG(dbgs() << "Found replacement " << *Result.Def << " for " +                        << *VD.U->get() << " in " << *(VD.U->getUser()) +                        << "\n");        assert(DT.dominates(cast<Instruction>(Result.Def), *VD.U) &&               "Predicateinfo def should have dominated this use");    
    VD.U->set(Result.Def); @@ -702,7 +721,22 @@ PredicateInfo::PredicateInfo(Function &F, DominatorTree &DT,    buildPredicateInfo();  } -PredicateInfo::~PredicateInfo() {} +// Remove all declarations we created . The PredicateInfo consumers are +// responsible for remove the ssa_copy calls created. +PredicateInfo::~PredicateInfo() { +  // Collect function pointers in set first, as SmallSet uses a SmallVector +  // internally and we have to remove the asserting value handles first. +  SmallPtrSet<Function *, 20> FunctionPtrs; +  for (auto &F : CreatedDeclarations) +    FunctionPtrs.insert(&*F); +  CreatedDeclarations.clear(); + +  for (Function *F : FunctionPtrs) { +    assert(F->user_begin() == F->user_end() && +           "PredicateInfo consumer did not remove all SSA copies."); +    F->eraseFromParent(); +  } +}  void PredicateInfo::verifyPredicateInfo() const {} @@ -720,6 +754,20 @@ void PredicateInfoPrinterLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {    AU.addRequired<AssumptionCacheTracker>();  } +// Replace ssa_copy calls created by PredicateInfo with their operand. 
+static void replaceCreatedSSACopys(PredicateInfo &PredInfo, Function &F) { +  for (auto I = inst_begin(F), E = inst_end(F); I != E;) { +    Instruction *Inst = &*I++; +    const auto *PI = PredInfo.getPredicateInfoFor(Inst); +    auto *II = dyn_cast<IntrinsicInst>(Inst); +    if (!PI || !II || II->getIntrinsicID() != Intrinsic::ssa_copy) +      continue; + +    Inst->replaceAllUsesWith(II->getOperand(0)); +    Inst->eraseFromParent(); +  } +} +  bool PredicateInfoPrinterLegacyPass::runOnFunction(Function &F) {    auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();    auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); @@ -727,6 +775,8 @@ bool PredicateInfoPrinterLegacyPass::runOnFunction(Function &F) {    PredInfo->print(dbgs());    if (VerifyPredicateInfo)      PredInfo->verifyPredicateInfo(); + +  replaceCreatedSSACopys(*PredInfo, F);    return false;  } @@ -735,12 +785,14 @@ PreservedAnalyses PredicateInfoPrinterPass::run(Function &F,    auto &DT = AM.getResult<DominatorTreeAnalysis>(F);    auto &AC = AM.getResult<AssumptionAnalysis>(F);    OS << "PredicateInfo for function: " << F.getName() << "\n"; -  make_unique<PredicateInfo>(F, DT, AC)->print(OS); +  auto PredInfo = make_unique<PredicateInfo>(F, DT, AC); +  PredInfo->print(OS); +  replaceCreatedSSACopys(*PredInfo, F);    return PreservedAnalyses::all();  } -/// \brief An assembly annotator class to print PredicateInfo information in +/// An assembly annotator class to print PredicateInfo information in  /// comments.  
class PredicateInfoAnnotatedWriter : public AssemblyAnnotationWriter {    friend class PredicateInfo; diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index fcd3bd08482a..86e15bbd7f22 100644 --- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -26,6 +26,7 @@  #include "llvm/Analysis/AssumptionCache.h"  #include "llvm/Analysis/InstructionSimplify.h"  #include "llvm/Analysis/IteratedDominanceFrontier.h" +#include "llvm/Transforms/Utils/Local.h"  #include "llvm/Analysis/ValueTracking.h"  #include "llvm/IR/BasicBlock.h"  #include "llvm/IR/CFG.h" @@ -45,7 +46,6 @@  #include "llvm/IR/Type.h"  #include "llvm/IR/User.h"  #include "llvm/Support/Casting.h" -#include "llvm/Transforms/Utils/Local.h"  #include "llvm/Transforms/Utils/PromoteMemToReg.h"  #include <algorithm>  #include <cassert> @@ -164,26 +164,27 @@ struct AllocaInfo {    }  }; -// Data package used by RenamePass() -class RenamePassData { -public: +/// Data package used by RenamePass(). +struct RenamePassData {    using ValVector = std::vector<Value *>; +  using LocationVector = std::vector<DebugLoc>; -  RenamePassData(BasicBlock *B, BasicBlock *P, ValVector V) -      : BB(B), Pred(P), Values(std::move(V)) {} +  RenamePassData(BasicBlock *B, BasicBlock *P, ValVector V, LocationVector L) +      : BB(B), Pred(P), Values(std::move(V)), Locations(std::move(L)) {}    BasicBlock *BB;    BasicBlock *Pred;    ValVector Values; +  LocationVector Locations;  }; -/// \brief This assigns and keeps a per-bb relative ordering of load/store +/// This assigns and keeps a per-bb relative ordering of load/store  /// instructions in the block that directly load or store an alloca.  ///  /// This functionality is important because it avoids scanning large basic  /// blocks multiple times when promoting many allocas in the same block.  
class LargeBlockInfo { -  /// \brief For each instruction that we track, keep the index of the +  /// For each instruction that we track, keep the index of the    /// instruction.    ///    /// The index starts out as the number of the instruction from the start of @@ -242,7 +243,7 @@ struct PromoteMem2Reg {    /// Reverse mapping of Allocas.    DenseMap<AllocaInst *, unsigned> AllocaLookup; -  /// \brief The PhiNodes we're adding. +  /// The PhiNodes we're adding.    ///    /// That map is used to simplify some Phi nodes as we iterate over it, so    /// it should have deterministic iterators.  We could use a MapVector, but @@ -294,7 +295,7 @@ private:    unsigned getNumPreds(const BasicBlock *BB) {      unsigned &NP = BBNumPreds[BB];      if (NP == 0) -      NP = std::distance(pred_begin(BB), pred_end(BB)) + 1; +      NP = pred_size(BB) + 1;      return NP - 1;    } @@ -303,6 +304,7 @@ private:                             SmallPtrSetImpl<BasicBlock *> &LiveInBlocks);    void RenamePass(BasicBlock *BB, BasicBlock *Pred,                    RenamePassData::ValVector &IncVals, +                  RenamePassData::LocationVector &IncLocs,                    std::vector<RenamePassData> &Worklist);    bool QueuePhiNode(BasicBlock *BB, unsigned AllocaIdx, unsigned &Version);  }; @@ -345,7 +347,7 @@ static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {    }  } -/// \brief Rewrite as many loads as possible given a single store. +/// Rewrite as many loads as possible given a single store.  ///  /// When there is only a single store, we can use the domtree to trivially  /// replace all of the dominated loads with the stored value. Do so, and return @@ -475,7 +477,7 @@ static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,    // Sort the stores by their index, making it efficient to do a lookup with a    // binary search. 
-  std::sort(StoresByIndex.begin(), StoresByIndex.end(), less_first()); +  llvm::sort(StoresByIndex.begin(), StoresByIndex.end(), less_first());    // Walk all of the loads from this alloca, replacing them with the nearest    // store above them, if any. @@ -509,6 +511,11 @@ static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,            !isKnownNonZero(ReplVal, DL, 0, AC, LI, &DT))          addAssumeNonNull(AC, LI); +      // If the replacement value is the load, this must occur in unreachable +      // code. +      if (ReplVal == LI) +        ReplVal = UndefValue::get(LI->getType()); +        LI->replaceAllUsesWith(ReplVal);      } @@ -631,10 +638,10 @@ void PromoteMem2Reg::run() {      SmallVector<BasicBlock *, 32> PHIBlocks;      IDF.calculate(PHIBlocks);      if (PHIBlocks.size() > 1) -      std::sort(PHIBlocks.begin(), PHIBlocks.end(), -                [this](BasicBlock *A, BasicBlock *B) { -                  return BBNumbers.lookup(A) < BBNumbers.lookup(B); -                }); +      llvm::sort(PHIBlocks.begin(), PHIBlocks.end(), +                 [this](BasicBlock *A, BasicBlock *B) { +                   return BBNumbers.lookup(A) < BBNumbers.lookup(B); +                 });      unsigned CurrentVersion = 0;      for (BasicBlock *BB : PHIBlocks) @@ -653,15 +660,20 @@ void PromoteMem2Reg::run() {    for (unsigned i = 0, e = Allocas.size(); i != e; ++i)      Values[i] = UndefValue::get(Allocas[i]->getAllocatedType()); +  // When handling debug info, treat all incoming values as if they have unknown +  // locations until proven otherwise. 
+  RenamePassData::LocationVector Locations(Allocas.size()); +    // Walks all basic blocks in the function performing the SSA rename algorithm    // and inserting the phi nodes we marked as necessary    std::vector<RenamePassData> RenamePassWorkList; -  RenamePassWorkList.emplace_back(&F.front(), nullptr, std::move(Values)); +  RenamePassWorkList.emplace_back(&F.front(), nullptr, std::move(Values), +                                  std::move(Locations));    do {      RenamePassData RPD = std::move(RenamePassWorkList.back());      RenamePassWorkList.pop_back();      // RenamePass may add new worklist entries. -    RenamePass(RPD.BB, RPD.Pred, RPD.Values, RenamePassWorkList); +    RenamePass(RPD.BB, RPD.Pred, RPD.Values, RPD.Locations, RenamePassWorkList);    } while (!RenamePassWorkList.empty());    // The renamer uses the Visited set to avoid infinite loops.  Clear it now. @@ -740,7 +752,7 @@ void PromoteMem2Reg::run() {      // Ok, now we know that all of the PHI nodes are missing entries for some      // basic blocks.  Start by sorting the incoming predecessors for efficient      // access. -    std::sort(Preds.begin(), Preds.end()); +    llvm::sort(Preds.begin(), Preds.end());      // Now we loop through all BB's which have entries in SomePHI and remove      // them from the Preds list. @@ -772,7 +784,7 @@ void PromoteMem2Reg::run() {    NewPhiNodes.clear();  } -/// \brief Determine which blocks the value is live in. +/// Determine which blocks the value is live in.  ///  /// These are blocks which lead to uses.  Knowing this allows us to avoid  /// inserting PHI nodes into blocks which don't lead to uses (thus, the @@ -846,7 +858,7 @@ void PromoteMem2Reg::ComputeLiveInBlocks(    }  } -/// \brief Queue a phi-node to be added to a basic-block for a specific Alloca. +/// Queue a phi-node to be added to a basic-block for a specific Alloca.  
///  /// Returns true if there wasn't already a phi-node for that variable  bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo, @@ -868,13 +880,24 @@ bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo,    return true;  } -/// \brief Recursively traverse the CFG of the function, renaming loads and +/// Update the debug location of a phi. \p ApplyMergedLoc indicates whether to +/// create a merged location incorporating \p DL, or to set \p DL directly. +static void updateForIncomingValueLocation(PHINode *PN, DebugLoc DL, +                                           bool ApplyMergedLoc) { +  if (ApplyMergedLoc) +    PN->applyMergedLocation(PN->getDebugLoc(), DL); +  else +    PN->setDebugLoc(DL); +} + +/// Recursively traverse the CFG of the function, renaming loads and  /// stores to the allocas which we are promoting.  ///  /// IncomingVals indicates what value each Alloca contains on exit from the  /// predecessor block Pred.  void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred,                                  RenamePassData::ValVector &IncomingVals, +                                RenamePassData::LocationVector &IncomingLocs,                                  std::vector<RenamePassData> &Worklist) {  NextIteration:    // If we are inserting any phi nodes into this BB, they will already be in the @@ -899,6 +922,10 @@ NextIteration:        do {          unsigned AllocaNo = PhiToAllocaMap[APN]; +        // Update the location of the phi node. +        updateForIncomingValueLocation(APN, IncomingLocs[AllocaNo], +                                       APN->getNumIncomingValues() > 0); +          // Add N incoming values to the PHI node.          for (unsigned i = 0; i != NumEdges; ++i)            APN->addIncoming(IncomingVals[AllocaNo], Pred); @@ -960,8 +987,11 @@ NextIteration:          continue;        // what value were we writing? 
-      IncomingVals[ai->second] = SI->getOperand(0); +      unsigned AllocaNo = ai->second; +      IncomingVals[AllocaNo] = SI->getOperand(0); +        // Record debuginfo for the store before removing it. +      IncomingLocs[AllocaNo] = SI->getDebugLoc();        for (DbgInfoIntrinsic *DII : AllocaDbgDeclares[ai->second])          ConvertDebugDeclareToDebugValue(DII, SI, DIB);        BB->getInstList().erase(SI); @@ -984,7 +1014,7 @@ NextIteration:    for (; I != E; ++I)      if (VisitedSuccs.insert(*I).second) -      Worklist.emplace_back(*I, Pred, IncomingVals); +      Worklist.emplace_back(*I, Pred, IncomingVals, IncomingLocs);    goto NextIteration;  } diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp index e4b20b0faa15..ca184ed7c4e3 100644 --- a/lib/Transforms/Utils/SSAUpdater.cpp +++ b/lib/Transforms/Utils/SSAUpdater.cpp @@ -147,11 +147,9 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {    if (isa<PHINode>(BB->begin())) {      SmallDenseMap<BasicBlock *, Value *, 8> ValueMapping(PredValues.begin(),                                                           PredValues.end()); -    PHINode *SomePHI; -    for (BasicBlock::iterator It = BB->begin(); -         (SomePHI = dyn_cast<PHINode>(It)); ++It) { -      if (IsEquivalentPHI(SomePHI, ValueMapping)) -        return SomePHI; +    for (PHINode &SomePHI : BB->phis()) { +      if (IsEquivalentPHI(&SomePHI, ValueMapping)) +        return &SomePHI;      }    } @@ -180,7 +178,7 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {    // If the client wants to know about all new instructions, tell it.    
if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI); -  DEBUG(dbgs() << "  Inserted PHI: " << *InsertedPHI << "\n"); +  LLVM_DEBUG(dbgs() << "  Inserted PHI: " << *InsertedPHI << "\n");    return InsertedPHI;  } diff --git a/lib/Transforms/Utils/SSAUpdaterBulk.cpp b/lib/Transforms/Utils/SSAUpdaterBulk.cpp new file mode 100644 index 000000000000..397bac2940a4 --- /dev/null +++ b/lib/Transforms/Utils/SSAUpdaterBulk.cpp @@ -0,0 +1,191 @@ +//===- SSAUpdaterBulk.cpp - Unstructured SSA Update Tool ------------------===// +// +//                     The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the SSAUpdaterBulk class. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/SSAUpdaterBulk.h" +#include "llvm/Analysis/IteratedDominanceFrontier.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/Value.h" + +using namespace llvm; + +#define DEBUG_TYPE "ssaupdaterbulk" + +/// Helper function for finding a block which should have a value for the given +/// user. For PHI-nodes this block is the corresponding predecessor, for other +/// instructions it's their parent block. +static BasicBlock *getUserBB(Use *U) { +  auto *User = cast<Instruction>(U->getUser()); + +  if (auto *UserPN = dyn_cast<PHINode>(User)) +    return UserPN->getIncomingBlock(*U); +  else +    return User->getParent(); +} + +/// Add a new variable to the SSA rewriter. This needs to be called before +/// AddAvailableValue or AddUse calls. 
+unsigned SSAUpdaterBulk::AddVariable(StringRef Name, Type *Ty) { +  unsigned Var = Rewrites.size(); +  LLVM_DEBUG(dbgs() << "SSAUpdater: Var=" << Var << ": initialized with Ty = " +                    << *Ty << ", Name = " << Name << "\n"); +  RewriteInfo RI(Name, Ty); +  Rewrites.push_back(RI); +  return Var; +} + +/// Indicate that a rewritten value is available in the specified block with the +/// specified value. +void SSAUpdaterBulk::AddAvailableValue(unsigned Var, BasicBlock *BB, Value *V) { +  assert(Var < Rewrites.size() && "Variable not found!"); +  LLVM_DEBUG(dbgs() << "SSAUpdater: Var=" << Var +                    << ": added new available value" << *V << " in " +                    << BB->getName() << "\n"); +  Rewrites[Var].Defines[BB] = V; +} + +/// Record a use of the symbolic value. This use will be updated with a +/// rewritten value when RewriteAllUses is called. +void SSAUpdaterBulk::AddUse(unsigned Var, Use *U) { +  assert(Var < Rewrites.size() && "Variable not found!"); +  LLVM_DEBUG(dbgs() << "SSAUpdater: Var=" << Var << ": added a use" << *U->get() +                    << " in " << getUserBB(U)->getName() << "\n"); +  Rewrites[Var].Uses.push_back(U); +} + +/// Return true if the SSAUpdater already has a value for the specified variable +/// in the specified block. +bool SSAUpdaterBulk::HasValueForBlock(unsigned Var, BasicBlock *BB) { +  return (Var < Rewrites.size()) ? Rewrites[Var].Defines.count(BB) : false; +} + +// Compute value at the given block BB. We either should already know it, or we +// should be able to recursively reach it going up dominator tree. 
+Value *SSAUpdaterBulk::computeValueAt(BasicBlock *BB, RewriteInfo &R, +                                      DominatorTree *DT) { +  if (!R.Defines.count(BB)) { +    if (DT->isReachableFromEntry(BB) && PredCache.get(BB).size()) { +      BasicBlock *IDom = DT->getNode(BB)->getIDom()->getBlock(); +      Value *V = computeValueAt(IDom, R, DT); +      R.Defines[BB] = V; +    } else +      R.Defines[BB] = UndefValue::get(R.Ty); +  } +  return R.Defines[BB]; +} + +/// Given sets of UsingBlocks and DefBlocks, compute the set of LiveInBlocks. +/// This is basically a subgraph limited by DefBlocks and UsingBlocks. +static void +ComputeLiveInBlocks(const SmallPtrSetImpl<BasicBlock *> &UsingBlocks, +                    const SmallPtrSetImpl<BasicBlock *> &DefBlocks, +                    SmallPtrSetImpl<BasicBlock *> &LiveInBlocks, +                    PredIteratorCache &PredCache) { +  // To determine liveness, we must iterate through the predecessors of blocks +  // where the def is live.  Blocks are added to the worklist if we need to +  // check their predecessors.  Start with all the using blocks. +  SmallVector<BasicBlock *, 64> LiveInBlockWorklist(UsingBlocks.begin(), +                                                    UsingBlocks.end()); + +  // Now that we have a set of blocks where the phi is live-in, recursively add +  // their predecessors until we find the full region the value is live. +  while (!LiveInBlockWorklist.empty()) { +    BasicBlock *BB = LiveInBlockWorklist.pop_back_val(); + +    // The block really is live in here, insert it into the set.  If already in +    // the set, then it has already been processed. +    if (!LiveInBlocks.insert(BB).second) +      continue; + +    // Since the value is live into BB, it is either defined in a predecessor or +    // live into it to.  Add the preds to the worklist unless they are a +    // defining block. 
+    for (BasicBlock *P : PredCache.get(BB)) { +      // The value is not live into a predecessor if it defines the value. +      if (DefBlocks.count(P)) +        continue; + +      // Otherwise it is, add to the worklist. +      LiveInBlockWorklist.push_back(P); +    } +  } +} + +/// Perform all the necessary updates, including new PHI-nodes insertion and the +/// requested uses update. +void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT, +                                    SmallVectorImpl<PHINode *> *InsertedPHIs) { +  for (auto &R : Rewrites) { +    // Compute locations for new phi-nodes. +    // For that we need to initialize DefBlocks from definitions in R.Defines, +    // UsingBlocks from uses in R.Uses, then compute LiveInBlocks, and then use +    // this set for computing iterated dominance frontier (IDF). +    // The IDF blocks are the blocks where we need to insert new phi-nodes. +    ForwardIDFCalculator IDF(*DT); +    LLVM_DEBUG(dbgs() << "SSAUpdater: rewriting " << R.Uses.size() +                      << " use(s)\n"); + +    SmallPtrSet<BasicBlock *, 2> DefBlocks; +    for (auto &Def : R.Defines) +      DefBlocks.insert(Def.first); +    IDF.setDefiningBlocks(DefBlocks); + +    SmallPtrSet<BasicBlock *, 2> UsingBlocks; +    for (Use *U : R.Uses) +      UsingBlocks.insert(getUserBB(U)); + +    SmallVector<BasicBlock *, 32> IDFBlocks; +    SmallPtrSet<BasicBlock *, 32> LiveInBlocks; +    ComputeLiveInBlocks(UsingBlocks, DefBlocks, LiveInBlocks, PredCache); +    IDF.resetLiveInBlocks(); +    IDF.setLiveInBlocks(LiveInBlocks); +    IDF.calculate(IDFBlocks); + +    // We've computed IDF, now insert new phi-nodes there. 
+    SmallVector<PHINode *, 4> InsertedPHIsForVar; +    for (auto *FrontierBB : IDFBlocks) { +      IRBuilder<> B(FrontierBB, FrontierBB->begin()); +      PHINode *PN = B.CreatePHI(R.Ty, 0, R.Name); +      R.Defines[FrontierBB] = PN; +      InsertedPHIsForVar.push_back(PN); +      if (InsertedPHIs) +        InsertedPHIs->push_back(PN); +    } + +    // Fill in arguments of the inserted PHIs. +    for (auto *PN : InsertedPHIsForVar) { +      BasicBlock *PBB = PN->getParent(); +      for (BasicBlock *Pred : PredCache.get(PBB)) +        PN->addIncoming(computeValueAt(Pred, R, DT), Pred); +    } + +    // Rewrite actual uses with the inserted definitions. +    SmallPtrSet<Use *, 4> ProcessedUses; +    for (Use *U : R.Uses) { +      if (!ProcessedUses.insert(U).second) +        continue; +      Value *V = computeValueAt(getUserBB(U), R, DT); +      Value *OldVal = U->get(); +      assert(OldVal && "Invalid use!"); +      // Notify that users of the existing value that it is being replaced. +      if (OldVal != V && OldVal->hasValueHandle()) +        ValueHandleBase::ValueIsRAUWd(OldVal, V); +      LLVM_DEBUG(dbgs() << "SSAUpdater: replacing " << *OldVal << " with " << *V +                        << "\n"); +      U->set(V); +    } +  } +} diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index e7358dbcb624..c87b5c16ffce 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -19,7 +19,6 @@  #include "llvm/ADT/SetOperations.h"  #include "llvm/ADT/SetVector.h"  #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h"  #include "llvm/ADT/SmallVector.h"  #include "llvm/ADT/Statistic.h"  #include "llvm/ADT/StringRef.h" @@ -28,6 +27,7 @@  #include "llvm/Analysis/EHPersonalities.h"  #include "llvm/Analysis/InstructionSimplify.h"  #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Transforms/Utils/Local.h"  #include "llvm/Analysis/ValueTracking.h"  #include "llvm/IR/Attributes.h"  
#include "llvm/IR/BasicBlock.h" @@ -66,7 +66,6 @@  #include "llvm/Support/MathExtras.h"  #include "llvm/Support/raw_ostream.h"  #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h"  #include "llvm/Transforms/Utils/ValueMapper.h"  #include <algorithm>  #include <cassert> @@ -283,12 +282,8 @@ isProfitableToFoldUnconditional(BranchInst *SI1, BranchInst *SI2,  /// of Succ.  static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred,                                    BasicBlock *ExistPred) { -  if (!isa<PHINode>(Succ->begin())) -    return; // Quick exit if nothing to do - -  PHINode *PN; -  for (BasicBlock::iterator I = Succ->begin(); (PN = dyn_cast<PHINode>(I)); ++I) -    PN->addIncoming(PN->getIncomingValueForBlock(ExistPred), NewPred); +  for (PHINode &PN : Succ->phis()) +    PN.addIncoming(PN.getIncomingValueForBlock(ExistPred), NewPred);  }  /// Compute an abstract "cost" of speculating the given instruction, @@ -692,9 +687,7 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) {    if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {      // Do not permit merging of large switch instructions into their      // predecessors unless there is only one predecessor. -    if (SI->getNumSuccessors() * std::distance(pred_begin(SI->getParent()), -                                               pred_end(SI->getParent())) <= -        128) +    if (SI->getNumSuccessors() * pred_size(SI->getParent()) <= 128)        CV = SI->getCondition();    } else if (BranchInst *BI = dyn_cast<BranchInst>(TI))      if (BI->isConditional() && BI->getCondition()->hasOneUse()) @@ -851,9 +844,9 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(        // Remove PHI node entries for the dead edge.        
ThisCases[0].Dest->removePredecessor(TI->getParent()); -      DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator() -                   << "Through successor TI: " << *TI << "Leaving: " << *NI -                   << "\n"); +      LLVM_DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator() +                        << "Through successor TI: " << *TI << "Leaving: " << *NI +                        << "\n");        EraseTerminatorInstAndDCECond(TI);        return true; @@ -865,8 +858,8 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(      for (unsigned i = 0, e = PredCases.size(); i != e; ++i)        DeadCases.insert(PredCases[i].Value); -    DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator() -                 << "Through successor TI: " << *TI); +    LLVM_DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator() +                      << "Through successor TI: " << *TI);      // Collect branch weights into a vector.      SmallVector<uint32_t, 8> Weights; @@ -892,7 +885,7 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(      if (HasWeight && Weights.size() >= 2)        setBranchWeights(SI, Weights); -    DEBUG(dbgs() << "Leaving: " << *TI << "\n"); +    LLVM_DEBUG(dbgs() << "Leaving: " << *TI << "\n");      return true;    } @@ -933,9 +926,9 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(    Instruction *NI = Builder.CreateBr(TheRealDest);    (void)NI; -  DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator() -               << "Through successor TI: " << *TI << "Leaving: " << *NI -               << "\n"); +  LLVM_DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator() +                    << "Through successor TI: " << *TI << "Leaving: " << *NI +                    << "\n");    EraseTerminatorInstAndDCECond(TI);    return true; @@ -1228,11 +1221,9 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,  
static bool isSafeToHoistInvoke(BasicBlock *BB1, BasicBlock *BB2,                                  Instruction *I1, Instruction *I2) {    for (BasicBlock *Succ : successors(BB1)) { -    PHINode *PN; -    for (BasicBlock::iterator BBI = Succ->begin(); -         (PN = dyn_cast<PHINode>(BBI)); ++BBI) { -      Value *BB1V = PN->getIncomingValueForBlock(BB1); -      Value *BB2V = PN->getIncomingValueForBlock(BB2); +    for (const PHINode &PN : Succ->phis()) { +      Value *BB1V = PN.getIncomingValueForBlock(BB1); +      Value *BB2V = PN.getIncomingValueForBlock(BB2);        if (BB1V != BB2V && (BB1V == I1 || BB2V == I2)) {          return false;        } @@ -1282,34 +1273,58 @@ static bool HoistThenElseCodeToIf(BranchInst *BI,      if (isa<TerminatorInst>(I1))        goto HoistTerminator; +    // If we're going to hoist a call, make sure that the two instructions we're +    // commoning/hoisting are both marked with musttail, or neither of them is +    // marked as such. Otherwise, we might end up in a situation where we hoist +    // from a block where the terminator is a `ret` to a block where the terminator +    // is a `br`, and `musttail` calls expect to be followed by a return. +    auto *C1 = dyn_cast<CallInst>(I1); +    auto *C2 = dyn_cast<CallInst>(I2); +    if (C1 && C2) +      if (C1->isMustTailCall() != C2->isMustTailCall()) +        return Changed; +      if (!TTI.isProfitableToHoist(I1) || !TTI.isProfitableToHoist(I2))        return Changed; -    // For a normal instruction, we just move one to right before the branch, -    // then replace all uses of the other with the first.  Finally, we remove -    // the now redundant second instruction. 
-    BIParent->getInstList().splice(BI->getIterator(), BB1->getInstList(), I1); -    if (!I2->use_empty()) -      I2->replaceAllUsesWith(I1); -    I1->andIRFlags(I2); -    unsigned KnownIDs[] = {LLVMContext::MD_tbaa, -                           LLVMContext::MD_range, -                           LLVMContext::MD_fpmath, -                           LLVMContext::MD_invariant_load, -                           LLVMContext::MD_nonnull, -                           LLVMContext::MD_invariant_group, -                           LLVMContext::MD_align, -                           LLVMContext::MD_dereferenceable, -                           LLVMContext::MD_dereferenceable_or_null, -                           LLVMContext::MD_mem_parallel_loop_access}; -    combineMetadata(I1, I2, KnownIDs); - -    // I1 and I2 are being combined into a single instruction.  Its debug -    // location is the merged locations of the original instructions. -    I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc()); - -    I2->eraseFromParent(); -    Changed = true; +    if (isa<DbgInfoIntrinsic>(I1) || isa<DbgInfoIntrinsic>(I2)) { +      assert (isa<DbgInfoIntrinsic>(I1) && isa<DbgInfoIntrinsic>(I2)); +      // The debug location is an integral part of a debug info intrinsic +      // and can't be separated from it or replaced.  Instead of attempting +      // to merge locations, simply hoist both copies of the intrinsic. +      BIParent->getInstList().splice(BI->getIterator(), +                                     BB1->getInstList(), I1); +      BIParent->getInstList().splice(BI->getIterator(), +                                     BB2->getInstList(), I2); +      Changed = true; +    } else { +      // For a normal instruction, we just move one to right before the branch, +      // then replace all uses of the other with the first.  Finally, we remove +      // the now redundant second instruction. 
+      BIParent->getInstList().splice(BI->getIterator(), +                                     BB1->getInstList(), I1); +      if (!I2->use_empty()) +        I2->replaceAllUsesWith(I1); +      I1->andIRFlags(I2); +      unsigned KnownIDs[] = {LLVMContext::MD_tbaa, +                             LLVMContext::MD_range, +                             LLVMContext::MD_fpmath, +                             LLVMContext::MD_invariant_load, +                             LLVMContext::MD_nonnull, +                             LLVMContext::MD_invariant_group, +                             LLVMContext::MD_align, +                             LLVMContext::MD_dereferenceable, +                             LLVMContext::MD_dereferenceable_or_null, +                             LLVMContext::MD_mem_parallel_loop_access}; +      combineMetadata(I1, I2, KnownIDs); + +      // I1 and I2 are being combined into a single instruction.  Its debug +      // location is the merged locations of the original instructions. +      I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc()); + +      I2->eraseFromParent(); +      Changed = true; +    }      I1 = &*BB1_Itr++;      I2 = &*BB2_Itr++; @@ -1332,18 +1347,16 @@ HoistTerminator:      return Changed;    for (BasicBlock *Succ : successors(BB1)) { -    PHINode *PN; -    for (BasicBlock::iterator BBI = Succ->begin(); -         (PN = dyn_cast<PHINode>(BBI)); ++BBI) { -      Value *BB1V = PN->getIncomingValueForBlock(BB1); -      Value *BB2V = PN->getIncomingValueForBlock(BB2); +    for (PHINode &PN : Succ->phis()) { +      Value *BB1V = PN.getIncomingValueForBlock(BB1); +      Value *BB2V = PN.getIncomingValueForBlock(BB2);        if (BB1V == BB2V)          continue;        // Check for passingValueIsAlwaysUndefined here because we would rather        // eliminate undefined control flow then converting it to a select. 
-      if (passingValueIsAlwaysUndefined(BB1V, PN) || -          passingValueIsAlwaysUndefined(BB2V, PN)) +      if (passingValueIsAlwaysUndefined(BB1V, &PN) || +          passingValueIsAlwaysUndefined(BB2V, &PN))          return Changed;        if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V)) @@ -1369,11 +1382,9 @@ HoistTerminator:    // nodes, so we insert select instruction to compute the final result.    std::map<std::pair<Value *, Value *>, SelectInst *> InsertedSelects;    for (BasicBlock *Succ : successors(BB1)) { -    PHINode *PN; -    for (BasicBlock::iterator BBI = Succ->begin(); -         (PN = dyn_cast<PHINode>(BBI)); ++BBI) { -      Value *BB1V = PN->getIncomingValueForBlock(BB1); -      Value *BB2V = PN->getIncomingValueForBlock(BB2); +    for (PHINode &PN : Succ->phis()) { +      Value *BB1V = PN.getIncomingValueForBlock(BB1); +      Value *BB2V = PN.getIncomingValueForBlock(BB2);        if (BB1V == BB2V)          continue; @@ -1386,9 +1397,9 @@ HoistTerminator:                                   BB1V->getName() + "." 
+ BB2V->getName(), BI));        // Make the PHI node use the select for all incoming values for BB1/BB2 -      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) -        if (PN->getIncomingBlock(i) == BB1 || PN->getIncomingBlock(i) == BB2) -          PN->setIncomingValue(i, SI); +      for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) +        if (PN.getIncomingBlock(i) == BB1 || PN.getIncomingBlock(i) == BB2) +          PN.setIncomingValue(i, SI);      }    } @@ -1727,7 +1738,8 @@ static bool SinkCommonCodeFromPredecessors(BasicBlock *BB) {    LockstepReverseIterator LRI(UnconditionalPreds);    while (LRI.isValid() &&           canSinkInstructions(*LRI, PHIOperands)) { -    DEBUG(dbgs() << "SINK: instruction can be sunk: " << *(*LRI)[0] << "\n"); +    LLVM_DEBUG(dbgs() << "SINK: instruction can be sunk: " << *(*LRI)[0] +                      << "\n");      InstructionsToSink.insert((*LRI).begin(), (*LRI).end());      ++ScanIdx;      --LRI; @@ -1739,7 +1751,7 @@ static bool SinkCommonCodeFromPredecessors(BasicBlock *BB) {        for (auto *V : PHIOperands[I])          if (InstructionsToSink.count(V) == 0)            ++NumPHIdValues; -    DEBUG(dbgs() << "SINK: #phid values: " << NumPHIdValues << "\n"); +    LLVM_DEBUG(dbgs() << "SINK: #phid values: " << NumPHIdValues << "\n");      unsigned NumPHIInsts = NumPHIdValues / UnconditionalPreds.size();      if ((NumPHIdValues % UnconditionalPreds.size()) != 0)          NumPHIInsts++; @@ -1767,7 +1779,7 @@ static bool SinkCommonCodeFromPredecessors(BasicBlock *BB) {      if (!Profitable)        return false; -    DEBUG(dbgs() << "SINK: Splitting edge\n"); +    LLVM_DEBUG(dbgs() << "SINK: Splitting edge\n");      // We have a conditional edge and we're going to sink some instructions.      // Insert a new block postdominating all blocks we're going to sink from.      
if (!SplitBlockPredecessors(BB, UnconditionalPreds, ".sink.split")) @@ -1789,16 +1801,17 @@ static bool SinkCommonCodeFromPredecessors(BasicBlock *BB) {    // and never actually sink it which means we produce more PHIs than intended.    // This is unlikely in practice though.    for (unsigned SinkIdx = 0; SinkIdx != ScanIdx; ++SinkIdx) { -    DEBUG(dbgs() << "SINK: Sink: " -                 << *UnconditionalPreds[0]->getTerminator()->getPrevNode() -                 << "\n"); +    LLVM_DEBUG(dbgs() << "SINK: Sink: " +                      << *UnconditionalPreds[0]->getTerminator()->getPrevNode() +                      << "\n");      // Because we've sunk every instruction in turn, the current instruction to      // sink is always at index 0.      LRI.reset();      if (!ProfitableToSinkInstruction(LRI)) {        // Too many PHIs would be created. -      DEBUG(dbgs() << "SINK: stopping here, too many PHIs would be created!\n"); +      LLVM_DEBUG( +          dbgs() << "SINK: stopping here, too many PHIs would be created!\n");        break;      } @@ -1810,7 +1823,7 @@ static bool SinkCommonCodeFromPredecessors(BasicBlock *BB) {    return Changed;  } -/// \brief Determine if we can hoist sink a sole store instruction out of a +/// Determine if we can hoist sink a sole store instruction out of a  /// conditional block.  ///  /// We are looking for code like the following: @@ -1850,12 +1863,9 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,    // Look for a store to the same pointer in BrBB.    unsigned MaxNumInstToLookAt = 9; -  for (Instruction &CurI : reverse(*BrBB)) { +  for (Instruction &CurI : reverse(BrBB->instructionsWithoutDebug())) {      if (!MaxNumInstToLookAt)        break; -    // Skip debug info. -    if (isa<DbgInfoIntrinsic>(CurI)) -      continue;      --MaxNumInstToLookAt;      // Could be calling an instruction that affects memory like free(). 
@@ -1874,7 +1884,7 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,    return nullptr;  } -/// \brief Speculate a conditional basic block flattening the CFG. +/// Speculate a conditional basic block flattening the CFG.  ///  /// Note that this is a very risky transform currently. Speculating  /// instructions like this is most often not desirable. Instead, there is an MI @@ -1999,10 +2009,9 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,    // Check that the PHI nodes can be converted to selects.    bool HaveRewritablePHIs = false; -  for (BasicBlock::iterator I = EndBB->begin(); -       PHINode *PN = dyn_cast<PHINode>(I); ++I) { -    Value *OrigV = PN->getIncomingValueForBlock(BB); -    Value *ThenV = PN->getIncomingValueForBlock(ThenBB); +  for (PHINode &PN : EndBB->phis()) { +    Value *OrigV = PN.getIncomingValueForBlock(BB); +    Value *ThenV = PN.getIncomingValueForBlock(ThenBB);      // FIXME: Try to remove some of the duplication with HoistThenElseCodeToIf.      // Skip PHIs which are trivial. @@ -2010,8 +2019,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,        continue;      // Don't convert to selects if we could remove undefined behavior instead. -    if (passingValueIsAlwaysUndefined(OrigV, PN) || -        passingValueIsAlwaysUndefined(ThenV, PN)) +    if (passingValueIsAlwaysUndefined(OrigV, &PN) || +        passingValueIsAlwaysUndefined(ThenV, &PN))        return false;      HaveRewritablePHIs = true; @@ -2045,7 +2054,7 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,      return false;    // If we get here, we can hoist the instruction and if-convert. -  DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";); +  LLVM_DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";);    // Insert a select of the value of the speculated store.    
if (SpeculatedStoreValue) { @@ -2072,12 +2081,11 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,    // Insert selects and rewrite the PHI operands.    IRBuilder<NoFolder> Builder(BI); -  for (BasicBlock::iterator I = EndBB->begin(); -       PHINode *PN = dyn_cast<PHINode>(I); ++I) { -    unsigned OrigI = PN->getBasicBlockIndex(BB); -    unsigned ThenI = PN->getBasicBlockIndex(ThenBB); -    Value *OrigV = PN->getIncomingValue(OrigI); -    Value *ThenV = PN->getIncomingValue(ThenI); +  for (PHINode &PN : EndBB->phis()) { +    unsigned OrigI = PN.getBasicBlockIndex(BB); +    unsigned ThenI = PN.getBasicBlockIndex(ThenBB); +    Value *OrigV = PN.getIncomingValue(OrigI); +    Value *ThenV = PN.getIncomingValue(ThenI);      // Skip PHIs which are trivial.      if (OrigV == ThenV) @@ -2091,8 +2099,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,        std::swap(TrueV, FalseV);      Value *V = Builder.CreateSelect(          BrCond, TrueV, FalseV, "spec.select", BI); -    PN->setIncomingValue(OrigI, V); -    PN->setIncomingValue(ThenI, V); +    PN.setIncomingValue(OrigI, V); +    PN.setIncomingValue(ThenI, V);    }    // Remove speculated dbg intrinsics. @@ -2107,19 +2115,16 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,  /// Return true if we can thread a branch across this block.  static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) { -  BranchInst *BI = cast<BranchInst>(BB->getTerminator());    unsigned Size = 0; -  for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) { -    if (isa<DbgInfoIntrinsic>(BBI)) -      continue; +  for (Instruction &I : BB->instructionsWithoutDebug()) {      if (Size > 10)        return false; // Don't clone large BB's.      ++Size;      // We can only support instructions that do not define values that are      // live outside of the current basic block. 
-    for (User *U : BBI->users()) { +    for (User *U : I.users()) {        Instruction *UI = cast<Instruction>(U);        if (UI->getParent() != BB || isa<PHINode>(UI))          return false; @@ -2261,6 +2266,10 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,    // dependence information for this check, but simplifycfg can't keep it up    // to date, and this catches most of the cases we care about anyway.    BasicBlock *BB = PN->getParent(); +  const Function *Fn = BB->getParent(); +  if (Fn && Fn->hasFnAttribute(Attribute::OptForFuzzing)) +    return false; +    BasicBlock *IfTrue, *IfFalse;    Value *IfCond = GetIfCondition(BB, IfTrue, IfFalse);    if (!IfCond || @@ -2351,8 +2360,9 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,        }    } -  DEBUG(dbgs() << "FOUND IF CONDITION!  " << *IfCond << "  T: " -               << IfTrue->getName() << "  F: " << IfFalse->getName() << "\n"); +  LLVM_DEBUG(dbgs() << "FOUND IF CONDITION!  " << *IfCond +                    << "  T: " << IfTrue->getName() +                    << "  F: " << IfFalse->getName() << "\n");    // If we can still promote the PHI nodes after this gauntlet of tests,    // do all of the PHI's now. @@ -2476,9 +2486,9 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,    (void)RI; -  DEBUG(dbgs() << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:" -               << "\n  " << *BI << "NewRet = " << *RI -               << "TRUEBLOCK: " << *TrueSucc << "FALSEBLOCK: " << *FalseSucc); +  LLVM_DEBUG(dbgs() << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:" +                    << "\n  " << *BI << "NewRet = " << *RI << "TRUEBLOCK: " +                    << *TrueSucc << "FALSEBLOCK: " << *FalseSucc);    EraseTerminatorInstAndDCECond(BI); @@ -2487,7 +2497,7 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,  /// Return true if the given instruction is available  /// in its predecessor block. 
If yes, the instruction will be removed. -static bool checkCSEInPredecessor(Instruction *Inst, BasicBlock *PB) { +static bool tryCSEWithPredecessor(Instruction *Inst, BasicBlock *PB) {    if (!isa<BinaryOperator>(Inst) && !isa<CmpInst>(Inst))      return false;    for (Instruction &I : *PB) { @@ -2544,14 +2554,16 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {          if (PBI->isConditional() &&              (BI->getSuccessor(0) == PBI->getSuccessor(0) ||               BI->getSuccessor(0) == PBI->getSuccessor(1))) { -          for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { +          for (auto I = BB->instructionsWithoutDebug().begin(), +                    E = BB->instructionsWithoutDebug().end(); +               I != E;) {              Instruction *Curr = &*I++;              if (isa<CmpInst>(Curr)) {                Cond = Curr;                break;              }              // Quit if we can't remove this instruction. -            if (!checkCSEInPredecessor(Curr, PB)) +            if (!tryCSEWithPredecessor(Curr, PB))                return false;            }          } @@ -2651,7 +2663,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {          continue;      } -    DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB); +    LLVM_DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB);      IRBuilder<> Builder(PBI);      // If we need to invert the condition in the pred block to match, do so now. @@ -2861,7 +2873,7 @@ static Value *ensureValueAvailableInSuccessor(Value *V, BasicBlock *BB,        if (!AlternativeV)          break; -      assert(std::distance(pred_begin(Succ), pred_end(Succ)) == 2); +      assert(pred_size(Succ) == 2);        auto PredI = pred_begin(Succ);        BasicBlock *OtherPredBB = *PredI == BB ? 
*++PredI : *PredI;        if (PHI->getIncomingValueForBlock(OtherPredBB) == AlternativeV) @@ -2904,14 +2916,13 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB,      // instructions inside are all cheap (arithmetic/GEPs), it's worthwhile to      // thread this store.      unsigned N = 0; -    for (auto &I : *BB) { +    for (auto &I : BB->instructionsWithoutDebug()) {        // Cheap instructions viable for folding.        if (isa<BinaryOperator>(I) || isa<GetElementPtrInst>(I) ||            isa<StoreInst>(I))          ++N;        // Free instructions. -      else if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || -               IsaBitcastOfPointerType(I)) +      else if (isa<TerminatorInst>(I) || IsaBitcastOfPointerType(I))          continue;        else          return false; @@ -2966,6 +2977,21 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB,      if (&*I != PStore && I->mayReadOrWriteMemory())        return false; +  // If PostBB has more than two predecessors, we need to split it so we can +  // sink the store. +  if (std::next(pred_begin(PostBB), 2) != pred_end(PostBB)) { +    // We know that QFB's only successor is PostBB. And QFB has a single +    // predecessor. If QTB exists, then its only successor is also PostBB. +    // If QTB does not exist, then QFB's only predecessor has a conditional +    // branch to QFB and PostBB. +    BasicBlock *TruePred = QTB ? QTB : QFB->getSinglePredecessor(); +    BasicBlock *NewBB = SplitBlockPredecessors(PostBB, { QFB, TruePred}, +                                               "condstore.split"); +    if (!NewBB) +      return false; +    PostBB = NewBB; +  } +    // OK, we're going to sink the stores to PostBB. The store has to be    // conditional though, so first create the predicate.    
Value *PCond = cast<BranchInst>(PFB->getSinglePredecessor()->getTerminator()) @@ -3101,7 +3127,7 @@ static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI,    if ((PTB && !HasOnePredAndOneSucc(PTB, PBI->getParent(), QBI->getParent())) ||        (QTB && !HasOnePredAndOneSucc(QTB, QBI->getParent(), PostBB)))      return false; -  if (!PostBB->hasNUses(2) || !QBI->getParent()->hasNUses(2)) +  if (!QBI->getParent()->hasNUses(2))      return false;    // OK, this is a sequence of two diamonds or triangles. @@ -3201,11 +3227,9 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,    // If this is a conditional branch in an empty block, and if any    // predecessors are a conditional branch to one of our destinations,    // fold the conditions into logical ops and one cond br. -  BasicBlock::iterator BBI = BB->begin(); +    // Ignore dbg intrinsics. -  while (isa<DbgInfoIntrinsic>(BBI)) -    ++BBI; -  if (&*BBI != BI) +  if (&*BB->instructionsWithoutDebug().begin() != BI)      return false;    int PBIOp, BIOp; @@ -3262,8 +3286,8 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,    // Finally, if everything is ok, fold the branches to logical ops.    BasicBlock *OtherDest = BI->getSuccessor(BIOp ^ 1); -  DEBUG(dbgs() << "FOLDING BRs:" << *PBI->getParent() -               << "AND: " << *BI->getParent()); +  LLVM_DEBUG(dbgs() << "FOLDING BRs:" << *PBI->getParent() +                    << "AND: " << *BI->getParent());    // If OtherDest *is* BB, then BB is a basic block with a single conditional    // branch in it, where one edge (OtherDest) goes back to itself but the other @@ -3281,7 +3305,7 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,      OtherDest = InfLoopBlock;    } -  DEBUG(dbgs() << *PBI->getParent()->getParent()); +  LLVM_DEBUG(dbgs() << *PBI->getParent()->getParent());    // BI may have other predecessors.  Because of this, we leave    // it alone, but modify PBI. 
@@ -3335,17 +3359,15 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,    // it.  If it has PHIs though, the PHIs may have different    // entries for BB and PBI's BB.  If so, insert a select to make    // them agree. -  PHINode *PN; -  for (BasicBlock::iterator II = CommonDest->begin(); -       (PN = dyn_cast<PHINode>(II)); ++II) { -    Value *BIV = PN->getIncomingValueForBlock(BB); -    unsigned PBBIdx = PN->getBasicBlockIndex(PBI->getParent()); -    Value *PBIV = PN->getIncomingValue(PBBIdx); +  for (PHINode &PN : CommonDest->phis()) { +    Value *BIV = PN.getIncomingValueForBlock(BB); +    unsigned PBBIdx = PN.getBasicBlockIndex(PBI->getParent()); +    Value *PBIV = PN.getIncomingValue(PBBIdx);      if (BIV != PBIV) {        // Insert a select in PBI to pick the right value.        SelectInst *NV = cast<SelectInst>(            Builder.CreateSelect(PBICond, PBIV, BIV, PBIV->getName() + ".mux")); -      PN->setIncomingValue(PBBIdx, NV); +      PN.setIncomingValue(PBBIdx, NV);        // Although the select has the same condition as PBI, the original branch        // weights for PBI do not apply to the new select because the select's        // 'logical' edges are incoming edges of the phi that is eliminated, not @@ -3367,8 +3389,8 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,      }    } -  DEBUG(dbgs() << "INTO: " << *PBI->getParent()); -  DEBUG(dbgs() << *PBI->getParent()->getParent()); +  LLVM_DEBUG(dbgs() << "INTO: " << *PBI->getParent()); +  LLVM_DEBUG(dbgs() << *PBI->getParent()->getParent());    // This basic block is probably dead.  We know it has at least    // one fewer predecessor. @@ -3668,9 +3690,9 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,    BasicBlock *BB = BI->getParent(); -  DEBUG(dbgs() << "Converting 'icmp' chain with " << Values.size() -               << " cases into SWITCH.  
BB is:\n" -               << *BB); +  LLVM_DEBUG(dbgs() << "Converting 'icmp' chain with " << Values.size() +                    << " cases into SWITCH.  BB is:\n" +                    << *BB);    // If there are any extra values that couldn't be folded into the switch    // then we evaluate them with an explicit branch first.  Split the block @@ -3693,8 +3715,8 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,      // for the edge we just added.      AddPredecessorToBlock(EdgeBB, BB, NewBB); -    DEBUG(dbgs() << "  ** 'icmp' chain unhandled condition: " << *ExtraCase -                 << "\nEXTRABB = " << *BB); +    LLVM_DEBUG(dbgs() << "  ** 'icmp' chain unhandled condition: " << *ExtraCase +                      << "\nEXTRABB = " << *BB);      BB = NewBB;    } @@ -3725,7 +3747,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,    // Erase the old branch instruction.    EraseTerminatorInstAndDCECond(BI); -  DEBUG(dbgs() << "  ** 'icmp' chain result is:\n" << *BB << '\n'); +  LLVM_DEBUG(dbgs() << "  ** 'icmp' chain result is:\n" << *BB << '\n');    return true;  } @@ -3876,6 +3898,7 @@ static bool removeEmptyCleanup(CleanupReturnInst *RI) {      switch (IntrinsicID) {      case Intrinsic::dbg_declare:      case Intrinsic::dbg_value: +    case Intrinsic::dbg_label:      case Intrinsic::lifetime_end:        break;      default: @@ -4052,8 +4075,8 @@ bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {    if (!UncondBranchPreds.empty() && DupRet) {      while (!UncondBranchPreds.empty()) {        BasicBlock *Pred = UncondBranchPreds.pop_back_val(); -      DEBUG(dbgs() << "FOLDING: " << *BB -                   << "INTO UNCOND BRANCH PRED: " << *Pred); +      LLVM_DEBUG(dbgs() << "FOLDING: " << *BB +                        << "INTO UNCOND BRANCH PRED: " << *Pred);        (void)FoldReturnIntoUncondBranch(RI, BB, Pred);      } @@ -4377,7 +4400,8 @@ static bool 
eliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,      if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) ||          (CaseVal.getMinSignedBits() > MaxSignificantBitsInCond)) {        DeadCases.push_back(Case.getCaseValue()); -      DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal << " is dead.\n"); +      LLVM_DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal +                        << " is dead.\n");      }    } @@ -4393,7 +4417,7 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,    if (HasDefault && DeadCases.empty() &&        NumUnknownBits < 64 /* avoid overflow */ &&        SI->getNumCases() == (1ULL << NumUnknownBits)) { -    DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n"); +    LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n");      BasicBlock *NewDefault =          SplitBlockPredecessors(SI->getDefaultDest(), SI->getParent(), "");      SI->setDefaultDest(&*NewDefault); @@ -4451,17 +4475,16 @@ static PHINode *FindPHIForConditionForwarding(ConstantInt *CaseValue,    BasicBlock *Succ = Branch->getSuccessor(0); -  BasicBlock::iterator I = Succ->begin(); -  while (PHINode *PHI = dyn_cast<PHINode>(I++)) { -    int Idx = PHI->getBasicBlockIndex(BB); +  for (PHINode &PHI : Succ->phis()) { +    int Idx = PHI.getBasicBlockIndex(BB);      assert(Idx >= 0 && "PHI has no entry for predecessor?"); -    Value *InValue = PHI->getIncomingValue(Idx); +    Value *InValue = PHI.getIncomingValue(Idx);      if (InValue != CaseValue)        continue;      *PhiIndex = Idx; -    return PHI; +    return &PHI;    }    return nullptr; @@ -4491,19 +4514,16 @@ static bool ForwardSwitchConditionToPHI(SwitchInst *SI) {      // -->      //     %r = phi i32 ... [ %x, %switchbb ] ... 
-    for (Instruction &InstInCaseDest : *CaseDest) { -      auto *Phi = dyn_cast<PHINode>(&InstInCaseDest); -      if (!Phi) break; - +    for (PHINode &Phi : CaseDest->phis()) {        // This only works if there is exactly 1 incoming edge from the switch to        // a phi. If there is >1, that means multiple cases of the switch map to 1        // value in the phi, and that phi value is not the switch condition. Thus,        // this transform would not make sense (the phi would be invalid because        // a phi can't have different incoming values from the same block). -      int SwitchBBIdx = Phi->getBasicBlockIndex(SwitchBlock); -      if (Phi->getIncomingValue(SwitchBBIdx) == CaseValue && -          count(Phi->blocks(), SwitchBlock) == 1) { -        Phi->setIncomingValue(SwitchBBIdx, SI->getCondition()); +      int SwitchBBIdx = Phi.getBasicBlockIndex(SwitchBlock); +      if (Phi.getIncomingValue(SwitchBBIdx) == CaseValue && +          count(Phi.blocks(), SwitchBlock) == 1) { +        Phi.setIncomingValue(SwitchBBIdx, SI->getCondition());          Changed = true;        }      } @@ -4614,24 +4634,20 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,    // which we can constant-propagate the CaseVal, continue to its successor.    SmallDenseMap<Value *, Constant *> ConstantPool;    ConstantPool.insert(std::make_pair(SI->getCondition(), CaseVal)); -  for (BasicBlock::iterator I = CaseDest->begin(), E = CaseDest->end(); I != E; -       ++I) { -    if (TerminatorInst *T = dyn_cast<TerminatorInst>(I)) { +  for (Instruction &I :CaseDest->instructionsWithoutDebug()) { +    if (TerminatorInst *T = dyn_cast<TerminatorInst>(&I)) {        // If the terminator is a simple branch, continue to the next block.        if (T->getNumSuccessors() != 1 || T->isExceptional())          return false;        Pred = CaseDest;        CaseDest = T->getSuccessor(0); -    } else if (isa<DbgInfoIntrinsic>(I)) { -      // Skip debug intrinsic. 
-      continue; -    } else if (Constant *C = ConstantFold(&*I, DL, ConstantPool)) { +    } else if (Constant *C = ConstantFold(&I, DL, ConstantPool)) {        // Instruction is side-effect free and constant.        // If the instruction has uses outside this block or a phi node slot for        // the block, it is not safe to bypass the instruction since it would then        // no longer dominate all its uses. -      for (auto &Use : I->uses()) { +      for (auto &Use : I.uses()) {          User *User = Use.getUser();          if (Instruction *I = dyn_cast<Instruction>(User))            if (I->getParent() == CaseDest) @@ -4642,7 +4658,7 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,          return false;        } -      ConstantPool.insert(std::make_pair(&*I, C)); +      ConstantPool.insert(std::make_pair(&I, C));      } else {        break;      } @@ -4656,14 +4672,13 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,      return false;    // Get the values for this case from phi nodes in the destination block. -  BasicBlock::iterator I = (*CommonDest)->begin(); -  while (PHINode *PHI = dyn_cast<PHINode>(I++)) { -    int Idx = PHI->getBasicBlockIndex(Pred); +  for (PHINode &PHI : (*CommonDest)->phis()) { +    int Idx = PHI.getBasicBlockIndex(Pred);      if (Idx == -1)        continue;      Constant *ConstVal = -        LookupConstant(PHI->getIncomingValue(Idx), ConstantPool); +        LookupConstant(PHI.getIncomingValue(Idx), ConstantPool);      if (!ConstVal)        return false; @@ -4671,37 +4686,38 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,      if (!ValidLookupTableConstant(ConstVal, TTI))        return false; -    Res.push_back(std::make_pair(PHI, ConstVal)); +    Res.push_back(std::make_pair(&PHI, ConstVal));    }    return Res.size() > 0;  }  // Helper function used to add CaseVal to the list of cases that generate -// Result. 
-static void MapCaseToResult(ConstantInt *CaseVal, -                            SwitchCaseResultVectorTy &UniqueResults, -                            Constant *Result) { +// Result. Returns the updated number of cases that generate this result. +static uintptr_t MapCaseToResult(ConstantInt *CaseVal, +                                 SwitchCaseResultVectorTy &UniqueResults, +                                 Constant *Result) {    for (auto &I : UniqueResults) {      if (I.first == Result) {        I.second.push_back(CaseVal); -      return; +      return I.second.size();      }    }    UniqueResults.push_back(        std::make_pair(Result, SmallVector<ConstantInt *, 4>(1, CaseVal))); +  return 1;  }  // Helper function that initializes a map containing  // results for the PHI node of the common destination block for a switch  // instruction. Returns false if multiple PHI nodes have been found or if  // there is not a common destination block for the switch. -static bool InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI, -                                  BasicBlock *&CommonDest, -                                  SwitchCaseResultVectorTy &UniqueResults, -                                  Constant *&DefaultResult, -                                  const DataLayout &DL, -                                  const TargetTransformInfo &TTI) { +static bool +InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI, BasicBlock *&CommonDest, +                      SwitchCaseResultVectorTy &UniqueResults, +                      Constant *&DefaultResult, const DataLayout &DL, +                      const TargetTransformInfo &TTI, +                      uintptr_t MaxUniqueResults, uintptr_t MaxCasesPerResult) {    for (auto &I : SI->cases()) {      ConstantInt *CaseVal = I.getCaseValue(); @@ -4711,10 +4727,21 @@ static bool InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI,                          DL, TTI))        return false; -    // Only one value per case is permitted +    
// Only one value per case is permitted.      if (Results.size() > 1)        return false; -    MapCaseToResult(CaseVal, UniqueResults, Results.begin()->second); + +    // Add the case->result mapping to UniqueResults. +    const uintptr_t NumCasesForResult = +        MapCaseToResult(CaseVal, UniqueResults, Results.begin()->second); + +    // Early out if there are too many cases for this result. +    if (NumCasesForResult > MaxCasesPerResult) +      return false; + +    // Early out if there are too many unique results. +    if (UniqueResults.size() > MaxUniqueResults) +      return false;      // Check the PHI consistency.      if (!PHI) @@ -4814,7 +4841,7 @@ static bool switchToSelect(SwitchInst *SI, IRBuilder<> &Builder,    SwitchCaseResultVectorTy UniqueResults;    // Collect all the cases that will deliver the same value from the switch.    if (!InitializeUniqueCases(SI, PHI, CommonDest, UniqueResults, DefaultResult, -                             DL, TTI)) +                             DL, TTI, 2, 1))      return false;    // Selects choose between maximum two values.    if (UniqueResults.size() != 2) @@ -5392,8 +5419,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,    }    bool ReturnedEarly = false; -  for (size_t I = 0, E = PHIs.size(); I != E; ++I) { -    PHINode *PHI = PHIs[I]; +  for (PHINode *PHI : PHIs) {      const ResultListTy &ResultList = ResultLists[PHI];      // If using a bitmask, use any value to fill the lookup table holes. @@ -5483,7 +5509,7 @@ static bool ReduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,    SmallVector<int64_t,4> Values;    for (auto &C : SI->cases())      Values.push_back(C.getCaseValue()->getValue().getSExtValue()); -  std::sort(Values.begin(), Values.end()); +  llvm::sort(Values.begin(), Values.end());    // If the switch is already dense, there's nothing useful to do here.    
if (isSwitchDense(Values)) @@ -5566,11 +5592,7 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {      // If the block only contains the switch, see if we can fold the block      // away into any preds. -    BasicBlock::iterator BBI = BB->begin(); -    // Ignore dbg intrinsics. -    while (isa<DbgInfoIntrinsic>(BBI)) -      ++BBI; -    if (SI == &*BBI) +    if (SI == &*BB->instructionsWithoutDebug().begin())        if (FoldValueComparisonIntoPredecessors(SI, Builder))          return simplifyCFG(BB, TTI, Options) | true;    } @@ -5657,7 +5679,7 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {  /// any transform which might inhibit optimization (such as our ability to  /// specialize a particular handler via tail commoning).  We do this by not  /// merging any blocks which require us to introduce a phi.  Since the same -/// values are flowing through both blocks, we don't loose any ability to +/// values are flowing through both blocks, we don't lose any ability to  /// specialize.  If anything, we make such specialization more likely.  ///  /// TODO - This transformation could remove entries from a phi in the target @@ -5687,7 +5709,7 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,      // We've found an identical block.  Update our predecessors to take that      // path instead and make ourselves dead. 
-    SmallSet<BasicBlock *, 16> Preds; +    SmallPtrSet<BasicBlock *, 16> Preds;      Preds.insert(pred_begin(BB), pred_end(BB));      for (BasicBlock *Pred : Preds) {        InvokeInst *II = cast<InvokeInst>(Pred->getTerminator()); @@ -5705,7 +5727,7 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,          Inst.eraseFromParent();      } -    SmallSet<BasicBlock *, 16> Succs; +    SmallPtrSet<BasicBlock *, 16> Succs;      Succs.insert(succ_begin(BB), succ_end(BB));      for (BasicBlock *Succ : Succs) {        Succ->removePredecessor(BB); @@ -5729,9 +5751,12 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI,    // header. (This is for early invocations before loop simplify and    // vectorization to keep canonical loop forms for nested loops. These blocks    // can be eliminated when the pass is invoked later in the back-end.) +  // Note that if BB has only one predecessor then we do not introduce new +  // backedge, so we can eliminate BB.    bool NeedCanonicalLoop =        Options.NeedCanonicalLoop && -      (LoopHeaders && (LoopHeaders->count(BB) || LoopHeaders->count(Succ))); +      (LoopHeaders && pred_size(BB) > 1 && +       (LoopHeaders->count(BB) || LoopHeaders->count(Succ)));    BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator();    if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() &&        !NeedCanonicalLoop && TryToSimplifyUncondBranchFromEmptyBlock(BB)) @@ -5779,6 +5804,9 @@ static BasicBlock *allPredecessorsComeFromSameSource(BasicBlock *BB) {  bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {    BasicBlock *BB = BI->getParent(); +  const Function *Fn = BB->getParent(); +  if (Fn && Fn->hasFnAttribute(Attribute::OptForFuzzing)) +    return false;    // Conditional branch    if (isValueEqualityComparison(BI)) { @@ -5791,18 +5819,12 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {      // This block must be empty, except for 
the setcond inst, if it exists.      // Ignore dbg intrinsics. -    BasicBlock::iterator I = BB->begin(); -    // Ignore dbg intrinsics. -    while (isa<DbgInfoIntrinsic>(I)) -      ++I; +    auto I = BB->instructionsWithoutDebug().begin();      if (&*I == BI) {        if (FoldValueComparisonIntoPredecessors(BI, Builder))          return simplifyCFG(BB, TTI, Options) | true;      } else if (&*I == cast<Instruction>(BI->getCondition())) {        ++I; -      // Ignore dbg intrinsics. -      while (isa<DbgInfoIntrinsic>(I)) -        ++I;        if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder))          return simplifyCFG(BB, TTI, Options) | true;      } @@ -5928,17 +5950,20 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) {      // Load from null is undefined.      if (LoadInst *LI = dyn_cast<LoadInst>(Use))        if (!LI->isVolatile()) -        return LI->getPointerAddressSpace() == 0; +        return !NullPointerIsDefined(LI->getFunction(), +                                     LI->getPointerAddressSpace());      // Store to null is undefined.      if (StoreInst *SI = dyn_cast<StoreInst>(Use))        if (!SI->isVolatile()) -        return SI->getPointerAddressSpace() == 0 && +        return (!NullPointerIsDefined(SI->getFunction(), +                                      SI->getPointerAddressSpace())) &&                 SI->getPointerOperand() == I;      // A call to null is undefined.      if (auto CS = CallSite(Use)) -      return CS.getCalledValue() == I; +      return !NullPointerIsDefined(CS->getFunction()) && +             CS.getCalledValue() == I;    }    return false;  } @@ -5946,14 +5971,13 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) {  /// If BB has an incoming value that will always trigger undefined behavior  /// (eg. null pointer dereference), remove the branch leading here.  
static bool removeUndefIntroducingPredecessor(BasicBlock *BB) { -  for (BasicBlock::iterator i = BB->begin(); -       PHINode *PHI = dyn_cast<PHINode>(i); ++i) -    for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) -      if (passingValueIsAlwaysUndefined(PHI->getIncomingValue(i), PHI)) { -        TerminatorInst *T = PHI->getIncomingBlock(i)->getTerminator(); +  for (PHINode &PHI : BB->phis()) +    for (unsigned i = 0, e = PHI.getNumIncomingValues(); i != e; ++i) +      if (passingValueIsAlwaysUndefined(PHI.getIncomingValue(i), &PHI)) { +        TerminatorInst *T = PHI.getIncomingBlock(i)->getTerminator();          IRBuilder<> Builder(T);          if (BranchInst *BI = dyn_cast<BranchInst>(T)) { -          BB->removePredecessor(PHI->getIncomingBlock(i)); +          BB->removePredecessor(PHI.getIncomingBlock(i));            // Turn unconditional branches into unreachables and remove the dead            // destination from conditional branches.            if (BI->isUnconditional()) @@ -5980,7 +6004,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {    // or that just have themselves as a predecessor.  These are unreachable.    
if ((pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) ||        BB->getSinglePredecessor() == BB) { -    DEBUG(dbgs() << "Removing BB: \n" << *BB); +    LLVM_DEBUG(dbgs() << "Removing BB: \n" << *BB);      DeleteDeadBlock(BB);      return true;    } diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp index ad1faea0a7ae..e381fbc34ab4 100644 --- a/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -26,6 +26,7 @@  #include "llvm/IR/PatternMatch.h"  #include "llvm/Support/Debug.h"  #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Local.h"  using namespace llvm; @@ -80,6 +81,7 @@ namespace {      bool replaceIVUserWithLoopInvariant(Instruction *UseInst);      bool eliminateOverflowIntrinsic(CallInst *CI); +    bool eliminateTrunc(TruncInst *TI);      bool eliminateIVUser(Instruction *UseInst, Instruction *IVOperand);      bool makeIVComparisonInvariant(ICmpInst *ICmp, Value *IVOperand);      void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand); @@ -147,8 +149,8 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand)    if (SE->getSCEV(UseInst) != FoldedExpr)      return nullptr; -  DEBUG(dbgs() << "INDVARS: Eliminated IV operand: " << *IVOperand -        << " -> " << *UseInst << '\n'); +  LLVM_DEBUG(dbgs() << "INDVARS: Eliminated IV operand: " << *IVOperand +                    << " -> " << *UseInst << '\n');    UseInst->setOperand(OperIdx, IVSrc);    assert(SE->getSCEV(UseInst) == FoldedExpr && "bad SCEV with folded oper"); @@ -221,7 +223,7 @@ bool SimplifyIndvar::makeIVComparisonInvariant(ICmpInst *ICmp,      // for now.      
return false; -  DEBUG(dbgs() << "INDVARS: Simplified comparison: " << *ICmp << '\n'); +  LLVM_DEBUG(dbgs() << "INDVARS: Simplified comparison: " << *ICmp << '\n');    ICmp->setPredicate(InvariantPredicate);    ICmp->setOperand(0, NewLHS);    ICmp->setOperand(1, NewRHS); @@ -252,11 +254,11 @@ void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {    if (SE->isKnownPredicate(Pred, S, X)) {      ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext()));      DeadInsts.emplace_back(ICmp); -    DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); +    LLVM_DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');    } else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) {      ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext()));      DeadInsts.emplace_back(ICmp); -    DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); +    LLVM_DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');    } else if (makeIVComparisonInvariant(ICmp, IVOperand)) {      // fallthrough to end of function    } else if (ICmpInst::isSigned(OriginalPred) && @@ -267,7 +269,8 @@ void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {      // we turn the instruction's predicate to its unsigned version. Note that      // we cannot rely on Pred here unless we check if we have swapped it.      
assert(ICmp->getPredicate() == OriginalPred && "Predicate changed?"); -    DEBUG(dbgs() << "INDVARS: Turn to unsigned comparison: " << *ICmp << '\n'); +    LLVM_DEBUG(dbgs() << "INDVARS: Turn to unsigned comparison: " << *ICmp +                      << '\n');      ICmp->setPredicate(ICmpInst::getUnsignedPredicate(OriginalPred));    } else      return; @@ -293,7 +296,7 @@ bool SimplifyIndvar::eliminateSDiv(BinaryOperator *SDiv) {          SDiv->getName() + ".udiv", SDiv);      UDiv->setIsExact(SDiv->isExact());      SDiv->replaceAllUsesWith(UDiv); -    DEBUG(dbgs() << "INDVARS: Simplified sdiv: " << *SDiv << '\n'); +    LLVM_DEBUG(dbgs() << "INDVARS: Simplified sdiv: " << *SDiv << '\n');      ++NumSimplifiedSDiv;      Changed = true;      DeadInsts.push_back(SDiv); @@ -309,7 +312,7 @@ void SimplifyIndvar::replaceSRemWithURem(BinaryOperator *Rem) {    auto *URem = BinaryOperator::Create(BinaryOperator::URem, N, D,                                        Rem->getName() + ".urem", Rem);    Rem->replaceAllUsesWith(URem); -  DEBUG(dbgs() << "INDVARS: Simplified srem: " << *Rem << '\n'); +  LLVM_DEBUG(dbgs() << "INDVARS: Simplified srem: " << *Rem << '\n');    ++NumSimplifiedSRem;    Changed = true;    DeadInsts.emplace_back(Rem); @@ -318,7 +321,7 @@ void SimplifyIndvar::replaceSRemWithURem(BinaryOperator *Rem) {  // i % n  -->  i  if i is in [0,n).  
void SimplifyIndvar::replaceRemWithNumerator(BinaryOperator *Rem) {    Rem->replaceAllUsesWith(Rem->getOperand(0)); -  DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n'); +  LLVM_DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');    ++NumElimRem;    Changed = true;    DeadInsts.emplace_back(Rem); @@ -332,7 +335,7 @@ void SimplifyIndvar::replaceRemWithNumeratorOrZero(BinaryOperator *Rem) {    SelectInst *Sel =        SelectInst::Create(ICmp, ConstantInt::get(T, 0), N, "iv.rem", Rem);    Rem->replaceAllUsesWith(Sel); -  DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n'); +  LLVM_DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');    ++NumElimRem;    Changed = true;    DeadInsts.emplace_back(Rem); @@ -492,6 +495,118 @@ bool SimplifyIndvar::eliminateOverflowIntrinsic(CallInst *CI) {    return true;  } +bool SimplifyIndvar::eliminateTrunc(TruncInst *TI) { +  // It is always legal to replace +  //   icmp <pred> i32 trunc(iv), n +  // with +  //   icmp <pred> i64 sext(trunc(iv)), sext(n), if pred is signed predicate. +  // Or with +  //   icmp <pred> i64 zext(trunc(iv)), zext(n), if pred is unsigned predicate. +  // Or with either of these if pred is an equality predicate. +  // +  // If we can prove that iv == sext(trunc(iv)) or iv == zext(trunc(iv)) for +  // every comparison which uses trunc, it means that we can replace each of +  // them with comparison of iv against sext/zext(n). We no longer need trunc +  // after that. +  // +  // TODO: Should we do this if we can widen *some* comparisons, but not all +  // of them? Sometimes it is enough to enable other optimizations, but the +  // trunc instruction will stay in the loop. +  Value *IV = TI->getOperand(0); +  Type *IVTy = IV->getType(); +  const SCEV *IVSCEV = SE->getSCEV(IV); +  const SCEV *TISCEV = SE->getSCEV(TI); + +  // Check if iv == zext(trunc(iv)) and if iv == sext(trunc(iv)). 
If so, we can +  // get rid of trunc +  bool DoesSExtCollapse = false; +  bool DoesZExtCollapse = false; +  if (IVSCEV == SE->getSignExtendExpr(TISCEV, IVTy)) +    DoesSExtCollapse = true; +  if (IVSCEV == SE->getZeroExtendExpr(TISCEV, IVTy)) +    DoesZExtCollapse = true; + +  // If neither sext nor zext does collapse, it is not profitable to do any +  // transform. Bail. +  if (!DoesSExtCollapse && !DoesZExtCollapse) +    return false; + +  // Collect users of the trunc that look like comparisons against invariants. +  // Bail if we find something different. +  SmallVector<ICmpInst *, 4> ICmpUsers; +  for (auto *U : TI->users()) { +    // We don't care about users in unreachable blocks. +    if (isa<Instruction>(U) && +        !DT->isReachableFromEntry(cast<Instruction>(U)->getParent())) +      continue; +    if (ICmpInst *ICI = dyn_cast<ICmpInst>(U)) { +      if (ICI->getOperand(0) == TI && L->isLoopInvariant(ICI->getOperand(1))) { +        assert(L->contains(ICI->getParent()) && "LCSSA form broken?"); +        // If we cannot get rid of trunc, bail. +        if (ICI->isSigned() && !DoesSExtCollapse) +          return false; +        if (ICI->isUnsigned() && !DoesZExtCollapse) +          return false; +        // For equality, either signed or unsigned works. +        ICmpUsers.push_back(ICI); +      } else +        return false; +    } else +      return false; +  } + +  auto CanUseZExt = [&](ICmpInst *ICI) { +    // Unsigned comparison can be widened as unsigned. +    if (ICI->isUnsigned()) +      return true; +    // Is it profitable to do zext? +    if (!DoesZExtCollapse) +      return false; +    // For equality, we can safely zext both parts. +    if (ICI->isEquality()) +      return true; +    // Otherwise we can only use zext when comparing two non-negative or two +    // negative values. But in practice, we will never pass DoesZExtCollapse +    // check for a negative value, because zext(trunc(x)) is non-negative. 
So +    // it only make sense to check for non-negativity here. +    const SCEV *SCEVOP1 = SE->getSCEV(ICI->getOperand(0)); +    const SCEV *SCEVOP2 = SE->getSCEV(ICI->getOperand(1)); +    return SE->isKnownNonNegative(SCEVOP1) && SE->isKnownNonNegative(SCEVOP2); +  }; +  // Replace all comparisons against trunc with comparisons against IV. +  for (auto *ICI : ICmpUsers) { +    auto *Op1 = ICI->getOperand(1); +    Instruction *Ext = nullptr; +    // For signed/unsigned predicate, replace the old comparison with comparison +    // of immediate IV against sext/zext of the invariant argument. If we can +    // use either sext or zext (i.e. we are dealing with equality predicate), +    // then prefer zext as a more canonical form. +    // TODO: If we see a signed comparison which can be turned into unsigned, +    // we can do it here for canonicalization purposes. +    ICmpInst::Predicate Pred = ICI->getPredicate(); +    if (CanUseZExt(ICI)) { +      assert(DoesZExtCollapse && "Unprofitable zext?"); +      Ext = new ZExtInst(Op1, IVTy, "zext", ICI); +      Pred = ICmpInst::getUnsignedPredicate(Pred); +    } else { +      assert(DoesSExtCollapse && "Unprofitable sext?"); +      Ext = new SExtInst(Op1, IVTy, "sext", ICI); +      assert(Pred == ICmpInst::getSignedPredicate(Pred) && "Must be signed!"); +    } +    bool Changed; +    L->makeLoopInvariant(Ext, Changed); +    (void)Changed; +    ICmpInst *NewICI = new ICmpInst(ICI, Pred, IV, Ext); +    ICI->replaceAllUsesWith(NewICI); +    DeadInsts.emplace_back(ICI); +  } + +  // Trunc no longer needed. +  TI->replaceAllUsesWith(UndefValue::get(TI->getType())); +  DeadInsts.emplace_back(TI); +  return true; +} +  /// Eliminate an operation that consumes a simple IV and has no observable  /// side-effect given the range of IV values.  IVOperand is guaranteed SCEVable,  /// but UseInst may not be. 
@@ -516,6 +631,10 @@ bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst,      if (eliminateOverflowIntrinsic(CI))        return true; +  if (auto *TI = dyn_cast<TruncInst>(UseInst)) +    if (eliminateTrunc(TI)) +      return true; +    if (eliminateIdentitySCEV(UseInst, IVOperand))      return true; @@ -548,8 +667,8 @@ bool SimplifyIndvar::replaceIVUserWithLoopInvariant(Instruction *I) {    auto *Invariant = Rewriter.expandCodeFor(S, I->getType(), IP);    I->replaceAllUsesWith(Invariant); -  DEBUG(dbgs() << "INDVARS: Replace IV user: " << *I -               << " with loop invariant: " << *S << '\n'); +  LLVM_DEBUG(dbgs() << "INDVARS: Replace IV user: " << *I +                    << " with loop invariant: " << *S << '\n');    ++NumFoldedUser;    Changed = true;    DeadInsts.emplace_back(I); @@ -589,7 +708,7 @@ bool SimplifyIndvar::eliminateIdentitySCEV(Instruction *UseInst,    if (!LI->replacementPreservesLCSSAForm(UseInst, IVOperand))      return false; -  DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n'); +  LLVM_DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n');    UseInst->replaceAllUsesWith(IVOperand);    ++NumElimIdentity; @@ -771,6 +890,15 @@ void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {        SimpleIVUsers.pop_back_val();      Instruction *UseInst = UseOper.first; +    // If a user of the IndVar is trivially dead, we prefer just to mark it dead +    // rather than try to do some complex analysis or transformation (such as +    // widening) basing on it. +    // TODO: Propagate TLI and pass it here to handle more cases. +    if (isInstructionTriviallyDead(UseInst, /* TLI */ nullptr)) { +      DeadInsts.emplace_back(UseInst); +      continue; +    } +      // Bypass back edges to avoid extra work.      
if (UseInst == CurrIV) continue; @@ -783,7 +911,7 @@ void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {      for (unsigned N = 0; IVOperand; ++N) {        assert(N <= Simplified.size() && "runaway iteration"); -      Value *NewOper = foldIVUser(UseOper.first, IVOperand); +      Value *NewOper = foldIVUser(UseInst, IVOperand);        if (!NewOper)          break; // done folding        IVOperand = dyn_cast<Instruction>(NewOper); @@ -791,12 +919,12 @@ void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {      if (!IVOperand)        continue; -    if (eliminateIVUser(UseOper.first, IVOperand)) { +    if (eliminateIVUser(UseInst, IVOperand)) {        pushIVUsers(IVOperand, L, Simplified, SimpleIVUsers);        continue;      } -    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(UseOper.first)) { +    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(UseInst)) {        if ((isa<OverflowingBinaryOperator>(BO) &&             strengthenOverflowingOperation(BO, IVOperand)) ||            (isa<ShlOperator>(BO) && strengthenRightShift(BO, IVOperand))) { @@ -806,13 +934,13 @@ void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {        }      } -    CastInst *Cast = dyn_cast<CastInst>(UseOper.first); +    CastInst *Cast = dyn_cast<CastInst>(UseInst);      if (V && Cast) {        V->visitCast(Cast);        continue;      } -    if (isSimpleIVUser(UseOper.first, L, SE)) { -      pushIVUsers(UseOper.first, L, Simplified, SimpleIVUsers); +    if (isSimpleIVUser(UseInst, L, SE)) { +      pushIVUsers(UseInst, L, Simplified, SimpleIVUsers);      }    }  } diff --git a/lib/Transforms/Utils/SimplifyInstructions.cpp b/lib/Transforms/Utils/SimplifyInstructions.cpp deleted file mode 100644 index f3d4f2ef38d7..000000000000 --- a/lib/Transforms/Utils/SimplifyInstructions.cpp +++ /dev/null @@ -1,152 +0,0 @@ -//===------ SimplifyInstructions.cpp - Remove redundant instructions ------===// -// -//                     The LLVM Compiler 
Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This is a utility pass used for testing the InstructionSimplify analysis. -// The analysis is applied to every instruction, and if it simplifies then the -// instruction is replaced by the simplification.  If you are looking for a pass -// that performs serious instruction folding, use the instcombine pass instead. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Transforms/Utils/SimplifyInstructions.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/OptimizationRemarkEmitter.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Type.h" -#include "llvm/Pass.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" -using namespace llvm; - -#define DEBUG_TYPE "instsimplify" - -STATISTIC(NumSimplified, "Number of redundant instructions removed"); - -static bool runImpl(Function &F, const SimplifyQuery &SQ, -                    OptimizationRemarkEmitter *ORE) { -  SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2; -  bool Changed = false; - -  do { -    for (BasicBlock *BB : depth_first(&F.getEntryBlock())) { -      // Here be subtlety: the iterator must be incremented before the loop -      // body (not sure why), so a range-for loop won't work here. 
-      for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { -        Instruction *I = &*BI++; -        // The first time through the loop ToSimplify is empty and we try to -        // simplify all instructions.  On later iterations ToSimplify is not -        // empty and we only bother simplifying instructions that are in it. -        if (!ToSimplify->empty() && !ToSimplify->count(I)) -          continue; - -        // Don't waste time simplifying unused instructions. -        if (!I->use_empty()) { -          if (Value *V = SimplifyInstruction(I, SQ, ORE)) { -            // Mark all uses for resimplification next time round the loop. -            for (User *U : I->users()) -              Next->insert(cast<Instruction>(U)); -            I->replaceAllUsesWith(V); -            ++NumSimplified; -            Changed = true; -          } -        } -        if (RecursivelyDeleteTriviallyDeadInstructions(I, SQ.TLI)) { -          // RecursivelyDeleteTriviallyDeadInstruction can remove more than one -          // instruction, so simply incrementing the iterator does not work. -          // When instructions get deleted re-iterate instead. -          BI = BB->begin(); -          BE = BB->end(); -          Changed = true; -        } -      } -    } - -    // Place the list of instructions to simplify on the next loop iteration -    // into ToSimplify. 
-    std::swap(ToSimplify, Next); -    Next->clear(); -  } while (!ToSimplify->empty()); - -  return Changed; -} - -namespace { -  struct InstSimplifier : public FunctionPass { -    static char ID; // Pass identification, replacement for typeid -    InstSimplifier() : FunctionPass(ID) { -      initializeInstSimplifierPass(*PassRegistry::getPassRegistry()); -    } - -    void getAnalysisUsage(AnalysisUsage &AU) const override { -      AU.setPreservesCFG(); -      AU.addRequired<DominatorTreeWrapperPass>(); -      AU.addRequired<AssumptionCacheTracker>(); -      AU.addRequired<TargetLibraryInfoWrapperPass>(); -      AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); -    } - -    /// runOnFunction - Remove instructions that simplify. -    bool runOnFunction(Function &F) override { -      if (skipFunction(F)) -        return false; - -      const DominatorTree *DT = -          &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); -      const TargetLibraryInfo *TLI = -          &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); -      AssumptionCache *AC = -          &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); -      OptimizationRemarkEmitter *ORE = -          &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); -      const DataLayout &DL = F.getParent()->getDataLayout(); -      const SimplifyQuery SQ(DL, TLI, DT, AC); -      return runImpl(F, SQ, ORE); -    } -  }; -} - -char InstSimplifier::ID = 0; -INITIALIZE_PASS_BEGIN(InstSimplifier, "instsimplify", -                      "Remove redundant instructions", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) -INITIALIZE_PASS_END(InstSimplifier, "instsimplify", -                    "Remove redundant instructions", false, false) -char &llvm::InstructionSimplifierID = InstSimplifier::ID; - 
-// Public interface to the simplify instructions pass. -FunctionPass *llvm::createInstructionSimplifierPass() { -  return new InstSimplifier(); -} - -PreservedAnalyses InstSimplifierPass::run(Function &F, -                                      FunctionAnalysisManager &AM) { -  auto &DT = AM.getResult<DominatorTreeAnalysis>(F); -  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); -  auto &AC = AM.getResult<AssumptionAnalysis>(F); -  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); -  const DataLayout &DL = F.getParent()->getDataLayout(); -  const SimplifyQuery SQ(DL, &TLI, &DT, &AC); -  bool Changed = runImpl(F, SQ, &ORE); -  if (!Changed) -    return PreservedAnalyses::all(); - -  PreservedAnalyses PA; -  PA.preserveSet<CFGAnalyses>(); -  return PA; -} diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index 03a1d55ddc30..8c48597fc2e4 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -7,10 +7,8 @@  //  //===----------------------------------------------------------------------===//  // -// This is a utility pass used for testing the InstructionSimplify analysis. -// The analysis is applied to every instruction, and if it simplifies then the -// instruction is replaced by the simplification.  If you are looking for a pass -// that performs serious instruction folding, use the instcombine pass instead. +// This file implements the library calls simplifier. It does not implement +// any pass, but can be used by other passes to do simplifications.  
//  //===----------------------------------------------------------------------===// @@ -21,7 +19,9 @@  #include "llvm/Analysis/ConstantFolding.h"  #include "llvm/Analysis/OptimizationRemarkEmitter.h"  #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h"  #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/CaptureTracking.h"  #include "llvm/IR/DataLayout.h"  #include "llvm/IR/Function.h"  #include "llvm/IR/IRBuilder.h" @@ -33,7 +33,6 @@  #include "llvm/Support/CommandLine.h"  #include "llvm/Support/KnownBits.h"  #include "llvm/Transforms/Utils/BuildLibCalls.h" -#include "llvm/Transforms/Utils/Local.h"  using namespace llvm;  using namespace PatternMatch; @@ -104,19 +103,51 @@ static bool callHasFloatingPointArgument(const CallInst *CI) {    });  } -/// \brief Check whether the overloaded unary floating point function -/// corresponding to \a Ty is available. -static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty, -                            LibFunc DoubleFn, LibFunc FloatFn, -                            LibFunc LongDoubleFn) { -  switch (Ty->getTypeID()) { -  case Type::FloatTyID: -    return TLI->has(FloatFn); -  case Type::DoubleTyID: -    return TLI->has(DoubleFn); -  default: -    return TLI->has(LongDoubleFn); -  } +static Value *convertStrToNumber(CallInst *CI, StringRef &Str, int64_t Base) { +  if (Base < 2 || Base > 36) +    // handle special zero base +    if (Base != 0) +      return nullptr; + +  char *End; +  std::string nptr = Str.str(); +  errno = 0; +  long long int Result = strtoll(nptr.c_str(), &End, Base); +  if (errno) +    return nullptr; + +  // if we assume all possible target locales are ASCII supersets, +  // then if strtoll successfully parses a number on the host, +  // it will also successfully parse the same way on the target +  if (*End != '\0') +    return nullptr; + +  if (!isIntN(CI->getType()->getPrimitiveSizeInBits(), Result)) +    return nullptr; + +  return 
ConstantInt::get(CI->getType(), Result); +} + +static bool isLocallyOpenedFile(Value *File, CallInst *CI, IRBuilder<> &B, +                                const TargetLibraryInfo *TLI) { +  CallInst *FOpen = dyn_cast<CallInst>(File); +  if (!FOpen) +    return false; + +  Function *InnerCallee = FOpen->getCalledFunction(); +  if (!InnerCallee) +    return false; + +  LibFunc Func; +  if (!TLI->getLibFunc(*InnerCallee, Func) || !TLI->has(Func) || +      Func != LibFunc_fopen) +    return false; + +  inferLibFuncAttributes(*CI->getCalledFunction(), *TLI); +  if (PointerMayBeCaptured(File, true, true)) +    return false; + +  return true;  }  //===----------------------------------------------------------------------===// @@ -156,9 +187,8 @@ Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len,    // We have enough information to now generate the memcpy call to do the    // concatenation for us.  Make a memcpy to copy the nul byte with align = 1. -  B.CreateMemCpy(CpyDst, Src, -                 ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1), -                 1); +  B.CreateMemCpy(CpyDst, 1, Src, 1, +                 ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1));    return Dst;  } @@ -346,8 +376,8 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) {    // We have enough information to now generate the memcpy call to do the    // copy for us.  Make a memcpy to copy the nul byte with align = 1. -  B.CreateMemCpy(Dst, Src, -                 ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len), 1); +  B.CreateMemCpy(Dst, 1, Src, 1, +                 ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len));    return Dst;  } @@ -371,7 +401,7 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) {    // We have enough information to now generate the memcpy call to do the    // copy for us.  Make a memcpy to copy the nul byte with align = 1. 
-  B.CreateMemCpy(Dst, Src, LenV, 1); +  B.CreateMemCpy(Dst, 1, Src, 1, LenV);    return DstEnd;  } @@ -388,7 +418,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) {    --SrcLen;    if (SrcLen == 0) { -    // strncpy(x, "", y) -> memset(x, '\0', y, 1) +    // strncpy(x, "", y) -> memset(align 1 x, '\0', y)      B.CreateMemSet(Dst, B.getInt8('\0'), LenOp, 1);      return Dst;    } @@ -407,8 +437,8 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) {      return nullptr;    Type *PT = Callee->getFunctionType()->getParamType(0); -  // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant] -  B.CreateMemCpy(Dst, Src, ConstantInt::get(DL.getIntPtrType(PT), Len), 1); +  // strncpy(x, s, c) -> memcpy(align 1 x, align 1 s, c) [s and c are constant] +  B.CreateMemCpy(Dst, 1, Src, 1, ConstantInt::get(DL.getIntPtrType(PT), Len));    return Dst;  } @@ -508,7 +538,7 @@ Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) {  }  Value *LibCallSimplifier::optimizeWcslen(CallInst *CI, IRBuilder<> &B) { -  Module &M = *CI->getParent()->getParent()->getParent(); +  Module &M = *CI->getModule();    unsigned WCharSize = TLI->getWCharSize(M) * 8;    // We cannot perform this optimization without wchar_size metadata.    
if (WCharSize == 0) @@ -816,40 +846,19 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) {  }  Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilder<> &B) { -  // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1) -  B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), -                 CI->getArgOperand(2), 1); +  // memcpy(x, y, n) -> llvm.memcpy(align 1 x, align 1 y, n) +  B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, +                 CI->getArgOperand(2));    return CI->getArgOperand(0);  }  Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) { -  // memmove(x, y, n) -> llvm.memmove(x, y, n, 1) -  B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1), -                  CI->getArgOperand(2), 1); +  // memmove(x, y, n) -> llvm.memmove(align 1 x, align 1 y, n) +  B.CreateMemMove(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, +                  CI->getArgOperand(2));    return CI->getArgOperand(0);  } -// TODO: Does this belong in BuildLibCalls or should all of those similar -// functions be moved here? -static Value *emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs, -                         IRBuilder<> &B, const TargetLibraryInfo &TLI) { -  LibFunc Func; -  if (!TLI.getLibFunc("calloc", Func) || !TLI.has(Func)) -    return nullptr; - -  Module *M = B.GetInsertBlock()->getModule(); -  const DataLayout &DL = M->getDataLayout(); -  IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext())); -  Value *Calloc = M->getOrInsertFunction("calloc", Attrs, B.getInt8PtrTy(), -                                         PtrType, PtrType); -  CallInst *CI = B.CreateCall(Calloc, { Num, Size }, "calloc"); - -  if (const auto *F = dyn_cast<Function>(Calloc->stripPointerCasts())) -    CI->setCallingConv(F->getCallingConv()); - -  return CI; -} -  /// Fold memset[_chk](malloc(n), 0, n) --> calloc(1, n).  
static Value *foldMallocMemset(CallInst *Memset, IRBuilder<> &B,                                 const TargetLibraryInfo &TLI) { @@ -901,12 +910,19 @@ Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) {    if (auto *Calloc = foldMallocMemset(CI, B, *TLI))      return Calloc; -  // memset(p, v, n) -> llvm.memset(p, v, n, 1) +  // memset(p, v, n) -> llvm.memset(align 1 p, v, n)    Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);    B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);    return CI->getArgOperand(0);  } +Value *LibCallSimplifier::optimizeRealloc(CallInst *CI, IRBuilder<> &B) { +  if (isa<ConstantPointerNull>(CI->getArgOperand(0))) +    return emitMalloc(CI->getArgOperand(1), B, DL, TLI); + +  return nullptr; +} +  //===----------------------------------------------------------------------===//  // Math Library Optimizations  //===----------------------------------------------------------------------===// @@ -1666,12 +1682,12 @@ Value *LibCallSimplifier::optimizeFls(CallInst *CI, IRBuilder<> &B) {  }  Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilder<> &B) { -  // abs(x) -> x >s -1 ? x : -x -  Value *Op = CI->getArgOperand(0); -  Value *Pos = -      B.CreateICmpSGT(Op, Constant::getAllOnesValue(Op->getType()), "ispos"); -  Value *Neg = B.CreateNeg(Op, "neg"); -  return B.CreateSelect(Pos, Op, Neg); +  // abs(x) -> x <s 0 ? -x : x +  // The negation has 'nsw' because abs of INT_MIN is undefined. 
+  Value *X = CI->getArgOperand(0); +  Value *IsNeg = B.CreateICmpSLT(X, Constant::getNullValue(X->getType())); +  Value *NegX = B.CreateNSWNeg(X, "neg"); +  return B.CreateSelect(IsNeg, NegX, X);  }  Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilder<> &B) { @@ -1695,6 +1711,29 @@ Value *LibCallSimplifier::optimizeToAscii(CallInst *CI, IRBuilder<> &B) {                       ConstantInt::get(CI->getType(), 0x7F));  } +Value *LibCallSimplifier::optimizeAtoi(CallInst *CI, IRBuilder<> &B) { +  StringRef Str; +  if (!getConstantStringInfo(CI->getArgOperand(0), Str)) +    return nullptr; + +  return convertStrToNumber(CI, Str, 10); +} + +Value *LibCallSimplifier::optimizeStrtol(CallInst *CI, IRBuilder<> &B) { +  StringRef Str; +  if (!getConstantStringInfo(CI->getArgOperand(0), Str)) +    return nullptr; + +  if (!isa<ConstantPointerNull>(CI->getArgOperand(1))) +    return nullptr; + +  if (ConstantInt *CInt = dyn_cast<ConstantInt>(CI->getArgOperand(2))) { +    return convertStrToNumber(CI, Str, CInt->getSExtValue()); +  } + +  return nullptr; +} +  //===----------------------------------------------------------------------===//  // Formatting and IO Library Call Optimizations  //===----------------------------------------------------------------------===// @@ -1826,15 +1865,13 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) {    if (CI->getNumArgOperands() == 2) {      // Make sure there's no % in the constant array.  We could try to handle      // %% -> % in the future if we cared. -    for (unsigned i = 0, e = FormatStr.size(); i != e; ++i) -      if (FormatStr[i] == '%') -        return nullptr; // we found a format specifier, bail out. +    if (FormatStr.find('%') != StringRef::npos) +      return nullptr; // we found a format specifier, bail out. 
-    // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1) -    B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), +    // sprintf(str, fmt) -> llvm.memcpy(align 1 str, align 1 fmt, strlen(fmt)+1) +    B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1,                     ConstantInt::get(DL.getIntPtrType(CI->getContext()), -                                    FormatStr.size() + 1), -                   1); // Copy the null byte. +                                    FormatStr.size() + 1)); // Copy the null byte.      return ConstantInt::get(CI->getType(), FormatStr.size());    } @@ -1868,7 +1905,7 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) {        return nullptr;      Value *IncLen =          B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1), "leninc"); -    B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), IncLen, 1); +    B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(2), 1, IncLen);      // The sprintf result is the unincremented number of bytes in the string.      return B.CreateIntCast(Len, CI->getType(), false); @@ -1897,6 +1934,93 @@ Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilder<> &B) {    return nullptr;  } +Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, IRBuilder<> &B) { +  // Check for a fixed format string. +  StringRef FormatStr; +  if (!getConstantStringInfo(CI->getArgOperand(2), FormatStr)) +    return nullptr; + +  // Check for size +  ConstantInt *Size = dyn_cast<ConstantInt>(CI->getArgOperand(1)); +  if (!Size) +    return nullptr; + +  uint64_t N = Size->getZExtValue(); + +  // If we just have a format string (nothing else crazy) transform it. +  if (CI->getNumArgOperands() == 3) { +    // Make sure there's no % in the constant array.  We could try to handle +    // %% -> % in the future if we cared. +    if (FormatStr.find('%') != StringRef::npos) +      return nullptr; // we found a format specifier, bail out. 
+ +    if (N == 0) +      return ConstantInt::get(CI->getType(), FormatStr.size()); +    else if (N < FormatStr.size() + 1) +      return nullptr; + +    // snprintf(str, size, fmt) -> llvm.memcpy(align 1 str, align 1 fmt, +    // strlen(fmt)+1) +    B.CreateMemCpy( +        CI->getArgOperand(0), 1, CI->getArgOperand(2), 1, +        ConstantInt::get(DL.getIntPtrType(CI->getContext()), +                         FormatStr.size() + 1)); // Copy the null byte. +    return ConstantInt::get(CI->getType(), FormatStr.size()); +  } + +  // The remaining optimizations require the format string to be "%s" or "%c" +  // and have an extra operand. +  if (FormatStr.size() == 2 && FormatStr[0] == '%' && +      CI->getNumArgOperands() == 4) { + +    // Decode the second character of the format string. +    if (FormatStr[1] == 'c') { +      if (N == 0) +        return ConstantInt::get(CI->getType(), 1); +      else if (N == 1) +        return nullptr; + +      // snprintf(dst, size, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0 +      if (!CI->getArgOperand(3)->getType()->isIntegerTy()) +        return nullptr; +      Value *V = B.CreateTrunc(CI->getArgOperand(3), B.getInt8Ty(), "char"); +      Value *Ptr = castToCStr(CI->getArgOperand(0), B); +      B.CreateStore(V, Ptr); +      Ptr = B.CreateGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul"); +      B.CreateStore(B.getInt8(0), Ptr); + +      return ConstantInt::get(CI->getType(), 1); +    } + +    if (FormatStr[1] == 's') { +      // snprintf(dest, size, "%s", str) to llvm.memcpy(dest, str, len+1, 1) +      StringRef Str; +      if (!getConstantStringInfo(CI->getArgOperand(3), Str)) +        return nullptr; + +      if (N == 0) +        return ConstantInt::get(CI->getType(), Str.size()); +      else if (N < Str.size() + 1) +        return nullptr; + +      B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(3), 1, +                     ConstantInt::get(CI->getType(), Str.size() + 1)); + +      // The snprintf result is the 
unincremented number of bytes in the string. +      return ConstantInt::get(CI->getType(), Str.size()); +    } +  } +  return nullptr; +} + +Value *LibCallSimplifier::optimizeSnPrintF(CallInst *CI, IRBuilder<> &B) { +  if (Value *V = optimizeSnPrintFString(CI, B)) { +    return V; +  } + +  return nullptr; +} +  Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, IRBuilder<> &B) {    optimizeErrorReporting(CI, B, 0); @@ -1913,9 +2037,9 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, IRBuilder<> &B) {    // fprintf(F, "foo") --> fwrite("foo", 3, 1, F)    if (CI->getNumArgOperands() == 2) { -    for (unsigned i = 0, e = FormatStr.size(); i != e; ++i) -      if (FormatStr[i] == '%') // Could handle %% -> % if we cared. -        return nullptr;        // We found a format specifier. +    // Could handle %% -> % if we cared. +    if (FormatStr.find('%') != StringRef::npos) +      return nullptr; // We found a format specifier.      return emitFWrite(          CI->getArgOperand(1), @@ -1973,22 +2097,27 @@ Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilder<> &B) {    // Get the element size and count.    ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1));    ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2)); -  if (!SizeC || !CountC) -    return nullptr; -  uint64_t Bytes = SizeC->getZExtValue() * CountC->getZExtValue(); - -  // If this is writing zero records, remove the call (it's a noop). -  if (Bytes == 0) -    return ConstantInt::get(CI->getType(), 0); - -  // If this is writing one byte, turn it into fputc. -  // This optimisation is only valid, if the return value is unused. -  if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F) -    Value *Char = B.CreateLoad(castToCStr(CI->getArgOperand(0), B), "char"); -    Value *NewCI = emitFPutC(Char, CI->getArgOperand(3), B, TLI); -    return NewCI ? 
ConstantInt::get(CI->getType(), 1) : nullptr; +  if (SizeC && CountC) { +    uint64_t Bytes = SizeC->getZExtValue() * CountC->getZExtValue(); + +    // If this is writing zero records, remove the call (it's a noop). +    if (Bytes == 0) +      return ConstantInt::get(CI->getType(), 0); + +    // If this is writing one byte, turn it into fputc. +    // This optimisation is only valid, if the return value is unused. +    if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F) +      Value *Char = B.CreateLoad(castToCStr(CI->getArgOperand(0), B), "char"); +      Value *NewCI = emitFPutC(Char, CI->getArgOperand(3), B, TLI); +      return NewCI ? ConstantInt::get(CI->getType(), 1) : nullptr; +    }    } +  if (isLocallyOpenedFile(CI->getArgOperand(3), CI, B, TLI)) +    return emitFWriteUnlocked(CI->getArgOperand(0), CI->getArgOperand(1), +                              CI->getArgOperand(2), CI->getArgOperand(3), B, DL, +                              TLI); +    return nullptr;  } @@ -1997,12 +2126,18 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) {    // Don't rewrite fputs to fwrite when optimising for size because fwrite    // requires more arguments and thus extra MOVs are required. -  if (CI->getParent()->getParent()->optForSize()) +  if (CI->getFunction()->optForSize())      return nullptr; -  // We can't optimize if return value is used. -  if (!CI->use_empty()) -    return nullptr; +  // Check if has any use +  if (!CI->use_empty()) { +    if (isLocallyOpenedFile(CI->getArgOperand(1), CI, B, TLI)) +      return emitFPutSUnlocked(CI->getArgOperand(0), CI->getArgOperand(1), B, +                               TLI); +    else +      // We can't optimize if return value is used. 
+      return nullptr; +  }    // fputs(s,F) --> fwrite(s,1,strlen(s),F)    uint64_t Len = GetStringLength(CI->getArgOperand(0)); @@ -2016,6 +2151,40 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) {        CI->getArgOperand(1), B, DL, TLI);  } +Value *LibCallSimplifier::optimizeFPutc(CallInst *CI, IRBuilder<> &B) { +  optimizeErrorReporting(CI, B, 1); + +  if (isLocallyOpenedFile(CI->getArgOperand(1), CI, B, TLI)) +    return emitFPutCUnlocked(CI->getArgOperand(0), CI->getArgOperand(1), B, +                             TLI); + +  return nullptr; +} + +Value *LibCallSimplifier::optimizeFGetc(CallInst *CI, IRBuilder<> &B) { +  if (isLocallyOpenedFile(CI->getArgOperand(0), CI, B, TLI)) +    return emitFGetCUnlocked(CI->getArgOperand(0), B, TLI); + +  return nullptr; +} + +Value *LibCallSimplifier::optimizeFGets(CallInst *CI, IRBuilder<> &B) { +  if (isLocallyOpenedFile(CI->getArgOperand(2), CI, B, TLI)) +    return emitFGetSUnlocked(CI->getArgOperand(0), CI->getArgOperand(1), +                             CI->getArgOperand(2), B, TLI); + +  return nullptr; +} + +Value *LibCallSimplifier::optimizeFRead(CallInst *CI, IRBuilder<> &B) { +  if (isLocallyOpenedFile(CI->getArgOperand(3), CI, B, TLI)) +    return emitFReadUnlocked(CI->getArgOperand(0), CI->getArgOperand(1), +                             CI->getArgOperand(2), CI->getArgOperand(3), B, DL, +                             TLI); + +  return nullptr; +} +  Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) {    // Check for a constant string.    
StringRef Str; @@ -2099,6 +2268,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI,        return optimizeMemMove(CI, Builder);      case LibFunc_memset:        return optimizeMemSet(CI, Builder); +    case LibFunc_realloc: +      return optimizeRealloc(CI, Builder);      case LibFunc_wcslen:        return optimizeWcslen(CI, Builder);      default: @@ -2290,16 +2461,33 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {        return optimizeIsAscii(CI, Builder);      case LibFunc_toascii:        return optimizeToAscii(CI, Builder); +    case LibFunc_atoi: +    case LibFunc_atol: +    case LibFunc_atoll: +      return optimizeAtoi(CI, Builder); +    case LibFunc_strtol: +    case LibFunc_strtoll: +      return optimizeStrtol(CI, Builder);      case LibFunc_printf:        return optimizePrintF(CI, Builder);      case LibFunc_sprintf:        return optimizeSPrintF(CI, Builder); +    case LibFunc_snprintf: +      return optimizeSnPrintF(CI, Builder);      case LibFunc_fprintf:        return optimizeFPrintF(CI, Builder);      case LibFunc_fwrite:        return optimizeFWrite(CI, Builder); +    case LibFunc_fread: +      return optimizeFRead(CI, Builder);      case LibFunc_fputs:        return optimizeFPuts(CI, Builder); +    case LibFunc_fgets: +      return optimizeFGets(CI, Builder); +    case LibFunc_fputc: +      return optimizeFPutc(CI, Builder); +    case LibFunc_fgetc: +      return optimizeFGetc(CI, Builder);      case LibFunc_puts:        return optimizePuts(CI, Builder);      case LibFunc_perror: @@ -2307,8 +2495,6 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {      case LibFunc_vfprintf:      case LibFunc_fiprintf:        return optimizeErrorReporting(CI, Builder, 0); -    case LibFunc_fputc: -      return optimizeErrorReporting(CI, Builder, 1);      default:        return nullptr;      } @@ -2393,8 +2579,8 @@ bool FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI,  Value 
*FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI,                                                       IRBuilder<> &B) {    if (isFortifiedCallFoldable(CI, 3, 2, false)) { -    B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), -                   CI->getArgOperand(2), 1); +    B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, +                   CI->getArgOperand(2));      return CI->getArgOperand(0);    }    return nullptr; @@ -2403,8 +2589,8 @@ Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI,  Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI,                                                        IRBuilder<> &B) {    if (isFortifiedCallFoldable(CI, 3, 2, false)) { -    B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1), -                    CI->getArgOperand(2), 1); +    B.CreateMemMove(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, +                    CI->getArgOperand(2));      return CI->getArgOperand(0);    }    return nullptr; diff --git a/lib/Transforms/Utils/SplitModule.cpp b/lib/Transforms/Utils/SplitModule.cpp index 968eb0208f43..f8d758c54983 100644 --- a/lib/Transforms/Utils/SplitModule.cpp +++ b/lib/Transforms/Utils/SplitModule.cpp @@ -101,7 +101,8 @@ static void findPartitions(Module *M, ClusterIDMapType &ClusterIDMap,    // At this point module should have the proper mix of globals and locals.    // As we attempt to partition this module, we must not change any    // locals to globals. 
-  DEBUG(dbgs() << "Partition module with (" << M->size() << ")functions\n"); +  LLVM_DEBUG(dbgs() << "Partition module with (" << M->size() +                    << ")functions\n");    ClusterMapType GVtoClusterMap;    ComdatMembersType ComdatMembers; @@ -180,28 +181,31 @@ static void findPartitions(Module *M, ClusterIDMapType &ClusterIDMap,            std::make_pair(std::distance(GVtoClusterMap.member_begin(I),                                         GVtoClusterMap.member_end()), I)); -  std::sort(Sets.begin(), Sets.end(), [](const SortType &a, const SortType &b) { -    if (a.first == b.first) -      return a.second->getData()->getName() > b.second->getData()->getName(); -    else -      return a.first > b.first; -  }); +  llvm::sort(Sets.begin(), Sets.end(), +             [](const SortType &a, const SortType &b) { +               if (a.first == b.first) +                 return a.second->getData()->getName() > +                        b.second->getData()->getName(); +               else +                 return a.first > b.first; +             });    for (auto &I : Sets) {      unsigned CurrentClusterID = BalancinQueue.top().first;      unsigned CurrentClusterSize = BalancinQueue.top().second;      BalancinQueue.pop(); -    DEBUG(dbgs() << "Root[" << CurrentClusterID << "] cluster_size(" << I.first -                 << ") ----> " << I.second->getData()->getName() << "\n"); +    LLVM_DEBUG(dbgs() << "Root[" << CurrentClusterID << "] cluster_size(" +                      << I.first << ") ----> " << I.second->getData()->getName() +                      << "\n");      for (ClusterMapType::member_iterator MI =               GVtoClusterMap.findLeader(I.second);           MI != GVtoClusterMap.member_end(); ++MI) {        if (!Visited.insert(*MI).second)          continue; -      DEBUG(dbgs() << "----> " << (*MI)->getName() -                   << ((*MI)->hasLocalLinkage() ? 
" l " : " e ") << "\n"); +      LLVM_DEBUG(dbgs() << "----> " << (*MI)->getName() +                        << ((*MI)->hasLocalLinkage() ? " l " : " e ") << "\n");        Visited.insert(*MI);        ClusterIDMap[*MI] = CurrentClusterID;        CurrentClusterSize++; @@ -270,7 +274,7 @@ void llvm::SplitModule(    for (unsigned I = 0; I < N; ++I) {      ValueToValueMapTy VMap;      std::unique_ptr<Module> MPart( -        CloneModule(M.get(), VMap, [&](const GlobalValue *GV) { +        CloneModule(*M, VMap, [&](const GlobalValue *GV) {            if (ClusterIDMap.count(GV))              return (ClusterIDMap[GV] == I);            else diff --git a/lib/Transforms/Utils/StripGCRelocates.cpp b/lib/Transforms/Utils/StripGCRelocates.cpp index 49dc15cf5e7c..ac0b519f4a77 100644 --- a/lib/Transforms/Utils/StripGCRelocates.cpp +++ b/lib/Transforms/Utils/StripGCRelocates.cpp @@ -21,7 +21,6 @@  #include "llvm/IR/Type.h"  #include "llvm/Pass.h"  #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar.h"  using namespace llvm; @@ -75,6 +74,3 @@ bool StripGCRelocates::runOnFunction(Function &F) {  INITIALIZE_PASS(StripGCRelocates, "strip-gc-relocates",                  "Strip gc.relocates inserted through RewriteStatepointsForGC",                  true, false) -FunctionPass *llvm::createStripGCRelocatesPass() { -  return new StripGCRelocates(); -} diff --git a/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp b/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp index cd0378e0140c..8956a089a99c 100644 --- a/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp +++ b/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp @@ -9,7 +9,7 @@  #include "llvm/IR/DebugInfo.h"  #include "llvm/Pass.h" -#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Utils.h"  using namespace llvm;  namespace { diff --git a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp index ed444e4cf43c..e633ac0c874d 100644 --- 
a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp +++ b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp @@ -19,7 +19,7 @@  #include "llvm/IR/Function.h"  #include "llvm/IR/Instructions.h"  #include "llvm/IR/Type.h" -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h"  using namespace llvm;  char UnifyFunctionExitNodes::ID = 0; diff --git a/lib/Transforms/Utils/Utils.cpp b/lib/Transforms/Utils/Utils.cpp index f6c7d1c4989e..afd842f59911 100644 --- a/lib/Transforms/Utils/Utils.cpp +++ b/lib/Transforms/Utils/Utils.cpp @@ -12,7 +12,10 @@  //  //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Utils.h"  #include "llvm-c/Initialization.h" +#include "llvm-c/Transforms/Utils.h" +#include "llvm/IR/LegacyPassManager.h"  #include "llvm/InitializePasses.h"  #include "llvm/PassRegistry.h" @@ -33,7 +36,6 @@ void llvm::initializeTransformUtils(PassRegistry &Registry) {    initializePromoteLegacyPassPass(Registry);    initializeStripNonLineTableDebugInfoPass(Registry);    initializeUnifyFunctionExitNodesPass(Registry); -  initializeInstSimplifierPass(Registry);    initializeMetaRenamerPass(Registry);    initializeStripGCRelocatesPass(Registry);    initializePredicateInfoPrinterLegacyPassPass(Registry); @@ -43,3 +45,12 @@ void llvm::initializeTransformUtils(PassRegistry &Registry) {  void LLVMInitializeTransformUtils(LLVMPassRegistryRef R) {    initializeTransformUtils(*unwrap(R));  } + +void LLVMAddLowerSwitchPass(LLVMPassManagerRef PM) { +  unwrap(PM)->add(createLowerSwitchPass()); +} + +void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) { +  unwrap(PM)->add(createPromoteMemoryToRegisterPass()); +} + diff --git a/lib/Transforms/Utils/VNCoercion.cpp b/lib/Transforms/Utils/VNCoercion.cpp index c3feea6a0a41..948d9bd5baad 100644 --- a/lib/Transforms/Utils/VNCoercion.cpp +++ b/lib/Transforms/Utils/VNCoercion.cpp @@ -20,8 +20,14 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy, 
       StoredVal->getType()->isStructTy() || StoredVal->getType()->isArrayTy())      return false; +  uint64_t StoreSize = DL.getTypeSizeInBits(StoredVal->getType()); + +  // The store size must be byte-aligned to support future type casts. +  if (llvm::alignTo(StoreSize, 8) != StoreSize) +    return false; +    // The store has to be at least as big as the load. -  if (DL.getTypeSizeInBits(StoredVal->getType()) < DL.getTypeSizeInBits(LoadTy)) +  if (StoreSize < DL.getTypeSizeInBits(LoadTy))      return false;    // Don't coerce non-integral pointers to integers or vice versa. @@ -389,8 +395,8 @@ Value *getLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy,      NewLoad->takeName(SrcVal);      NewLoad->setAlignment(SrcVal->getAlignment()); -    DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n"); -    DEBUG(dbgs() << "TO: " << *NewLoad << "\n"); +    LLVM_DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n"); +    LLVM_DEBUG(dbgs() << "TO: " << *NewLoad << "\n");      // Replace uses of the original load with the wider load.  On a big endian      // system, we need to shift down to get the relevant bits. diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index 8c9ecbc3503e..55fff3f3872a 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -25,6 +25,7 @@  #include "llvm/IR/CallSite.h"  #include "llvm/IR/Constant.h"  #include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfoMetadata.h"  #include "llvm/IR/DerivedTypes.h"  #include "llvm/IR/Function.h"  #include "llvm/IR/GlobalAlias.h" @@ -536,13 +537,23 @@ Optional<Metadata *> MDNodeMapper::tryToMapOperand(const Metadata *Op) {    return None;  } +static Metadata *cloneOrBuildODR(const MDNode &N) { +  auto *CT = dyn_cast<DICompositeType>(&N); +  // If ODR type uniquing is enabled, we would have uniqued composite types +  // with identifiers during bitcode reading, so we can just use CT. 
+  if (CT && CT->getContext().isODRUniquingDebugTypes() && +      CT->getIdentifier() != "") +    return const_cast<DICompositeType *>(CT); +  return MDNode::replaceWithDistinct(N.clone()); +} +  MDNode *MDNodeMapper::mapDistinctNode(const MDNode &N) {    assert(N.isDistinct() && "Expected a distinct node");    assert(!M.getVM().getMappedMD(&N) && "Expected an unmapped node"); -  DistinctWorklist.push_back(cast<MDNode>( -      (M.Flags & RF_MoveDistinctMDs) -          ? M.mapToSelf(&N) -          : M.mapToMetadata(&N, MDNode::replaceWithDistinct(N.clone())))); +  DistinctWorklist.push_back( +      cast<MDNode>((M.Flags & RF_MoveDistinctMDs) +                       ? M.mapToSelf(&N) +                       : M.mapToMetadata(&N, cloneOrBuildODR(N))));    return DistinctWorklist.back();  } | 
