Diffstat (limited to 'lib/Transforms/Utils')
40 files changed, 7781 insertions, 2941 deletions
diff --git a/lib/Transforms/Utils/ASanStackFrameLayout.cpp b/lib/Transforms/Utils/ASanStackFrameLayout.cpp
index 409326eba401f..7e50d4bb447ef 100644
--- a/lib/Transforms/Utils/ASanStackFrameLayout.cpp
+++ b/lib/Transforms/Utils/ASanStackFrameLayout.cpp
@@ -44,7 +44,7 @@ static size_t VarAndRedzoneSize(size_t Size, size_t Alignment) {
   else if (Size <= 512) Res = Size + 64;
   else if (Size <= 4096) Res = Size + 128;
   else                   Res = Size + 256;
-  return RoundUpToAlignment(Res, Alignment);
+  return alignTo(Res, Alignment);
 }

 void
diff --git a/lib/Transforms/Utils/AddDiscriminators.cpp b/lib/Transforms/Utils/AddDiscriminators.cpp
index 0262358fa3d57..d034905b6572b 100644
--- a/lib/Transforms/Utils/AddDiscriminators.cpp
+++ b/lib/Transforms/Utils/AddDiscriminators.cpp
@@ -52,7 +52,9 @@
 // http://wiki.dwarfstd.org/index.php?title=Path_Discriminators
 //===----------------------------------------------------------------------===//

+#include "llvm/Transforms/Utils/AddDiscriminators.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DIBuilder.h"
@@ -72,20 +74,22 @@ using namespace llvm;
 #define DEBUG_TYPE "add-discriminators"

 namespace {
-struct AddDiscriminators : public FunctionPass {
+// The legacy pass of AddDiscriminators.
+struct AddDiscriminatorsLegacyPass : public FunctionPass {
   static char ID; // Pass identification, replacement for typeid
-  AddDiscriminators() : FunctionPass(ID) {
-    initializeAddDiscriminatorsPass(*PassRegistry::getPassRegistry());
+  AddDiscriminatorsLegacyPass() : FunctionPass(ID) {
+    initializeAddDiscriminatorsLegacyPassPass(*PassRegistry::getPassRegistry());
   }

   bool runOnFunction(Function &F) override;
 };
-}
-char AddDiscriminators::ID = 0;
-INITIALIZE_PASS_BEGIN(AddDiscriminators, "add-discriminators",
+} // end anonymous namespace
+
+char AddDiscriminatorsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(AddDiscriminatorsLegacyPass, "add-discriminators",
                       "Add DWARF path discriminators", false, false)
-INITIALIZE_PASS_END(AddDiscriminators, "add-discriminators",
+INITIALIZE_PASS_END(AddDiscriminatorsLegacyPass, "add-discriminators",
                     "Add DWARF path discriminators", false, false)

 // Command line option to disable discriminator generation even in the
@@ -95,13 +99,9 @@ static cl::opt<bool> NoDiscriminators(
     "no-discriminators", cl::init(false),
     cl::desc("Disable generation of discriminator information."));

+// Create the legacy AddDiscriminatorsPass.
 FunctionPass *llvm::createAddDiscriminatorsPass() {
-  return new AddDiscriminators();
-}
-
-static bool hasDebugInfo(const Function &F) {
-  DISubprogram *S = getDISubprogram(&F);
-  return S != nullptr;
+  return new AddDiscriminatorsLegacyPass();
 }

 /// \brief Assign DWARF discriminators.
@@ -155,13 +155,13 @@
 /// lexical block for I2 and all the instruction in B2 that share the same
 /// file and line location as I2. This new lexical block will have a
 /// different discriminator number than I1.
-bool AddDiscriminators::runOnFunction(Function &F) {
+static bool addDiscriminators(Function &F) {
   // If the function has debug information, but the user has disabled
   // discriminators, do nothing.
   // Simlarly, if the function has no debug info, do nothing.
   // Finally, if this module is built with dwarf versions earlier than 4,
   // do nothing (discriminator support is a DWARF 4 feature).
-  if (NoDiscriminators || !hasDebugInfo(F) ||
+  if (NoDiscriminators || !F.getSubprogram() ||
       F.getParent()->getDwarfVersion() < 4)
     return false;

@@ -173,8 +173,11 @@ bool AddDiscriminators::runOnFunction(Function &F) {
   typedef std::pair<StringRef, unsigned> Location;
   typedef DenseMap<const BasicBlock *, Metadata *> BBScopeMap;
   typedef DenseMap<Location, BBScopeMap> LocationBBMap;
+  typedef DenseMap<Location, unsigned> LocationDiscriminatorMap;
+  typedef DenseSet<Location> LocationSet;

   LocationBBMap LBM;
+  LocationDiscriminatorMap LDM;

   // Traverse all instructions in the function. If the source line location
   // of the instruction appears in other basic block, assign a new
@@ -199,8 +202,7 @@ bool AddDiscriminators::runOnFunction(Function &F) {
         auto *Scope = DIL->getScope();
         auto *File = Builder.createFile(DIL->getFilename(),
                                         Scope->getDirectory());
-        NewScope = Builder.createLexicalBlockFile(
-            Scope, File, DIL->computeNewDiscriminator());
+        NewScope = Builder.createLexicalBlockFile(Scope, File, ++LDM[L]);
       }
       I.setDebugLoc(DILocation::get(Ctx, DIL->getLine(), DIL->getColumn(),
                                     NewScope, DIL->getInlinedAt()));
@@ -217,32 +219,40 @@ bool AddDiscriminators::runOnFunction(Function &F) {
   // Sample base profile needs to distinguish different function calls within
   // a same source line for correct profile annotation.
   for (BasicBlock &B : F) {
-    const DILocation *FirstDIL = NULL;
+    LocationSet CallLocations;
     for (auto &I : B.getInstList()) {
       CallInst *Current = dyn_cast<CallInst>(&I);
       if (!Current || isa<DbgInfoIntrinsic>(&I))
         continue;

       DILocation *CurrentDIL = Current->getDebugLoc();
-      if (FirstDIL) {
-        if (CurrentDIL && CurrentDIL->getLine() == FirstDIL->getLine() &&
-            CurrentDIL->getFilename() == FirstDIL->getFilename()) {
-          auto *Scope = FirstDIL->getScope();
-          auto *File = Builder.createFile(FirstDIL->getFilename(),
-                                          Scope->getDirectory());
-          auto *NewScope = Builder.createLexicalBlockFile(
-              Scope, File, FirstDIL->computeNewDiscriminator());
-          Current->setDebugLoc(DILocation::get(
-              Ctx, CurrentDIL->getLine(), CurrentDIL->getColumn(), NewScope,
-              CurrentDIL->getInlinedAt()));
-          Changed = true;
-        } else {
-          FirstDIL = CurrentDIL;
-        }
-      } else {
-        FirstDIL = CurrentDIL;
+      if (!CurrentDIL)
+        continue;
+      Location L =
+          std::make_pair(CurrentDIL->getFilename(), CurrentDIL->getLine());
+      if (!CallLocations.insert(L).second) {
+        auto *Scope = CurrentDIL->getScope();
+        auto *File = Builder.createFile(CurrentDIL->getFilename(),
+                                        Scope->getDirectory());
+        auto *NewScope = Builder.createLexicalBlockFile(Scope, File, ++LDM[L]);
+        Current->setDebugLoc(DILocation::get(Ctx, CurrentDIL->getLine(),
+                                             CurrentDIL->getColumn(), NewScope,
+                                             CurrentDIL->getInlinedAt()));
+        Changed = true;
       }
     }
   }
   return Changed;
 }
+
+bool AddDiscriminatorsLegacyPass::runOnFunction(Function &F) {
+  return addDiscriminators(F);
+}
+PreservedAnalyses AddDiscriminatorsPass::run(Function &F,
+                                             AnalysisManager<Function> &AM) {
+  if (!addDiscriminators(F))
+    return PreservedAnalyses::all();
+
+  // FIXME: should be all()
+  return PreservedAnalyses::none();
+}
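The rewritten pass above replaces DILocation::computeNewDiscriminator() with a per-function counter map: each (file, line) Location owns a counter, and ++LDM[L] hands out 1, 2, 3, ... for successive copies of the same location. A minimal standalone sketch of that allocation scheme, in plain C++ (std::map stands in for llvm::DenseMap; all names are illustrative, not LLVM API):

// Each (file, line) gets a monotonically increasing counter, so the first
// duplicate at a location receives discriminator 1, the next 2, and so on.
#include <iostream>
#include <map>
#include <string>
#include <utility>

using Location = std::pair<std::string, unsigned>; // (filename, line)

int main() {
  std::map<Location, unsigned> LDM; // per-function discriminator counters

  // Two instructions mapping to foo.c:7 in different basic blocks: the
  // second gets a fresh discriminator via pre-increment, exactly like
  // ++LDM[L] in the patch.
  Location L{"foo.c", 7};
  unsigned First = ++LDM[L];  // 1
  unsigned Second = ++LDM[L]; // 2
  std::cout << First << " " << Second << "\n";
}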
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index 72db980cf572a..b90349d3cdad1 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -31,8 +31,6 @@
 #include <algorithm>
 using namespace llvm;

-/// DeleteDeadBlock - Delete the specified block, which must have no
-/// predecessors.
 void llvm::DeleteDeadBlock(BasicBlock *BB) {
   assert((pred_begin(BB) == pred_end(BB) ||
           // Can delete self loop.
@@ -61,12 +59,8 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) {
   BB->eraseFromParent();
 }

-/// FoldSingleEntryPHINodes - We know that BB has one predecessor. If there are
-/// any single-entry PHI nodes in it, fold them away. This handles the case
-/// when all entries to the PHI nodes in a block are guaranteed equal, such as
-/// when the block has exactly one predecessor.
 void llvm::FoldSingleEntryPHINodes(BasicBlock *BB,
-                                   MemoryDependenceAnalysis *MemDep) {
+                                   MemoryDependenceResults *MemDep) {
   if (!isa<PHINode>(BB->begin()))
     return;

   while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
@@ -82,11 +76,6 @@ void llvm::FoldSingleEntryPHINodes(BasicBlock *BB,
   }
 }

-
-/// DeleteDeadPHIs - Examine each PHI in the given block and delete it if it
-/// is dead. Also recursively delete any operands that become dead as
-/// a result. This includes tracing the def-use list from the PHI to see if
-/// it is ultimately unused or if it reaches an unused cycle.
 bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) {
   // Recursively deleting a PHI may cause multiple PHIs to be deleted
   // or RAUW'd undef, so use an array of WeakVH for the PHIs to delete.
@@ -103,11 +92,9 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) {
   return Changed;
 }

-/// MergeBlockIntoPredecessor - Attempts to merge a block into its predecessor,
-/// if possible. The return value indicates success or failure.
 bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT,
                                      LoopInfo *LI,
-                                     MemoryDependenceAnalysis *MemDep) {
+                                     MemoryDependenceResults *MemDep) {
   // Don't merge away blocks who have their address taken.
   if (BB->hasAddressTaken())
     return false;
@@ -165,10 +152,8 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT,
   if (DomTreeNode *DTN = DT->getNode(BB)) {
     DomTreeNode *PredDTN = DT->getNode(PredBB);
     SmallVector<DomTreeNode *, 8> Children(DTN->begin(), DTN->end());
-    for (SmallVectorImpl<DomTreeNode *>::iterator DI = Children.begin(),
-                                                  DE = Children.end();
-         DI != DE; ++DI)
-      DT->changeImmediateDominator(*DI, PredDTN);
+    for (DomTreeNode *DI : Children)
+      DT->changeImmediateDominator(DI, PredDTN);

     DT->eraseNode(BB);
   }
@@ -183,9 +168,6 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DominatorTree *DT,
   return true;
 }

-/// ReplaceInstWithValue - Replace all uses of an instruction (specified by BI)
-/// with a value, then remove and delete the original instruction.
-///
 void llvm::ReplaceInstWithValue(BasicBlock::InstListType &BIL,
                                 BasicBlock::iterator &BI, Value *V) {
   Instruction &I = *BI;
@@ -200,11 +182,6 @@ void llvm::ReplaceInstWithValue(BasicBlock::InstListType &BIL,
   BI = BIL.erase(BI);
 }

-
-/// ReplaceInstWithInst - Replace the instruction specified by BI with the
-/// instruction specified by I. The original instruction is deleted and BI is
-/// updated to point to the new instruction.
-///
 void llvm::ReplaceInstWithInst(BasicBlock::InstListType &BIL,
                                BasicBlock::iterator &BI, Instruction *I) {
   assert(I->getParent() == nullptr &&
@@ -225,16 +202,11 @@ void llvm::ReplaceInstWithInst(BasicBlock::InstListType &BIL,
   BI = New;
 }

-/// ReplaceInstWithInst - Replace the instruction specified by From with the
-/// instruction specified by To.
-///
 void llvm::ReplaceInstWithInst(Instruction *From, Instruction *To) {
   BasicBlock::iterator BI(From);
   ReplaceInstWithInst(From->getParent()->getInstList(), BI, To);
 }

-/// SplitEdge - Split the edge connecting specified block. Pass P must
-/// not be NULL.
 BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT,
                             LoopInfo *LI) {
   unsigned SuccNum = GetSuccessorNumber(BB, Succ);
@@ -266,8 +238,8 @@ unsigned llvm::SplitAllCriticalEdges(Function &F,
                                      const CriticalEdgeSplittingOptions &Options) {
   unsigned NumBroken = 0;
-  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
-    TerminatorInst *TI = I->getTerminator();
+  for (BasicBlock &BB : F) {
+    TerminatorInst *TI = BB.getTerminator();
     if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI))
       for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
         if (SplitCriticalEdge(TI, i, Options))
@@ -276,11 +248,6 @@ llvm::SplitAllCriticalEdges(Function &F,
   return NumBroken;
 }

-/// SplitBlock - Split the specified block at the specified instruction - every
-/// thing before SplitPt stays in Old and everything starting with SplitPt moves
-/// to a new block. The two blocks are joined by an unconditional branch and
-/// the loop info is updated.
-///
 BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt,
                              DominatorTree *DT, LoopInfo *LI) {
   BasicBlock::iterator SplitIt = SplitPt->getIterator();
@@ -297,22 +264,17 @@ BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt,
   if (DT)
     // Old dominates New. New node dominates all other nodes dominated by Old.
     if (DomTreeNode *OldNode = DT->getNode(Old)) {
-      std::vector<DomTreeNode *> Children;
-      for (DomTreeNode::iterator I = OldNode->begin(), E = OldNode->end();
-           I != E; ++I)
-        Children.push_back(*I);
+      std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());

       DomTreeNode *NewNode = DT->addNewBlock(New, Old);
-      for (std::vector<DomTreeNode *>::iterator I = Children.begin(),
-           E = Children.end(); I != E; ++I)
-        DT->changeImmediateDominator(*I, NewNode);
+      for (DomTreeNode *I : Children)
+        DT->changeImmediateDominator(I, NewNode);
     }

   return New;
 }

-/// UpdateAnalysisInformation - Update DominatorTree, LoopInfo, and LCCSA
-/// analysis information.
+/// Update DominatorTree, LoopInfo, and LCCSA analysis information.
 static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
                                       ArrayRef<BasicBlock *> Preds,
                                       DominatorTree *DT, LoopInfo *LI,
@@ -331,10 +293,7 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
   // this split will affect loops.
   bool IsLoopEntry = !!L;
   bool SplitMakesNewLoopHeader = false;
-  for (ArrayRef<BasicBlock *>::iterator i = Preds.begin(), e = Preds.end();
-       i != e; ++i) {
-    BasicBlock *Pred = *i;
-
+  for (BasicBlock *Pred : Preds) {
     // If we need to preserve LCSSA, determine if any of the preds is a loop
     // exit.
     if (PreserveLCSSA)
@@ -362,9 +321,7 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
   // loops enclose them, and select the most-nested loop which contains the
   // loop containing the block being split.
   Loop *InnermostPredLoop = nullptr;
-  for (ArrayRef<BasicBlock*>::iterator
-       i = Preds.begin(), e = Preds.end(); i != e; ++i) {
-    BasicBlock *Pred = *i;
+  for (BasicBlock *Pred : Preds) {
     if (Loop *PredLoop = LI->getLoopFor(Pred)) {
       // Seek a loop which actually contains the block being split (to avoid
       // adjacent loops).
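Note why SplitBlock copies OldNode's dominator-tree children into a std::vector before reparenting them: changeImmediateDominator removes each child from the list being walked, so iterating the live child list would invalidate the iterators. A small self-contained illustration of the same snapshot-then-mutate idiom, using a toy Node type rather than LLVM's DomTreeNode:

#include <algorithm>
#include <vector>

struct Node {
  Node *Parent = nullptr;
  std::vector<Node *> Children;
};

// Moving a child erases it from the old parent's child list, which is
// exactly the mutation that would invalidate a live iteration.
static void reparent(Node &Child, Node &NewParent) {
  auto &Sibs = Child.Parent->Children;
  Sibs.erase(std::find(Sibs.begin(), Sibs.end(), &Child));
  Child.Parent = &NewParent;
  NewParent.Children.push_back(&Child);
}

int main() {
  Node Old, New, A, B;
  A.Parent = B.Parent = &Old;
  Old.Children = {&A, &B};

  // Snapshot first, as SplitBlock does with
  // std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());
  std::vector<Node *> Snapshot(Old.Children.begin(), Old.Children.end());
  for (Node *C : Snapshot)
    reparent(*C, New);
}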
@@ -388,8 +345,8 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
   }
 }

-/// UpdatePHINodes - Update the PHI nodes in OrigBB to include the values coming
-/// from NewBB. This also updates AliasAnalysis, if available.
+/// Update the PHI nodes in OrigBB to include the values coming from NewBB.
+/// This also updates AliasAnalysis, if available.
 static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
                            ArrayRef<BasicBlock *> Preds, BranchInst *BI,
                            bool HasLoopExit) {
@@ -456,21 +413,6 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
   }
 }

-/// SplitBlockPredecessors - This method introduces at least one new basic block
-/// into the function and moves some of the predecessors of BB to be
-/// predecessors of the new block. The new predecessors are indicated by the
-/// Preds array. The new block is given a suffix of 'Suffix'. Returns new basic
-/// block to which predecessors from Preds are now pointing.
-///
-/// If BB is a landingpad block then additional basicblock might be introduced.
-/// It will have suffix of 'Suffix'+".split_lp".
-/// See SplitLandingPadPredecessors for more details on this case.
-///
-/// This currently updates the LLVM IR, AliasAnalysis, DominatorTree,
-/// LoopInfo, and LCCSA but no other analyses. In particular, it does not
-/// preserve LoopSimplify (because it's complicated to handle the case where one
-/// of the edges being split is an exit of a loop with other exits).
-///
 BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
                                          ArrayRef<BasicBlock *> Preds,
                                          const char *Suffix, DominatorTree *DT,
@@ -529,19 +471,6 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
   return NewBB;
 }

-/// SplitLandingPadPredecessors - This method transforms the landing pad,
-/// OrigBB, by introducing two new basic blocks into the function. One of those
-/// new basic blocks gets the predecessors listed in Preds. The other basic
-/// block gets the remaining predecessors of OrigBB. The landingpad instruction
-/// OrigBB is clone into both of the new basic blocks. The new blocks are given
-/// the suffixes 'Suffix1' and 'Suffix2', and are returned in the NewBBs vector.
-///
-/// This currently updates the LLVM IR, AliasAnalysis, DominatorTree,
-/// DominanceFrontier, LoopInfo, and LCCSA but no other analyses. In particular,
-/// it does not preserve LoopSimplify (because it's complicated to handle the
-/// case where one of the edges being split is an exit of a loop with other
-/// exits).
-///
 void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,
                                        ArrayRef<BasicBlock *> Preds,
                                        const char *Suffix1, const char *Suffix2,
@@ -603,9 +532,8 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,
   BI2->setDebugLoc(OrigBB->getFirstNonPHI()->getDebugLoc());

   // Move the remaining edges from OrigBB to point to NewBB2.
-  for (SmallVectorImpl<BasicBlock*>::iterator
-       i = NewBB2Preds.begin(), e = NewBB2Preds.end(); i != e; ++i)
-    (*i)->getTerminator()->replaceUsesOfWith(OrigBB, NewBB2);
+  for (BasicBlock *NewBB2Pred : NewBB2Preds)
+    NewBB2Pred->getTerminator()->replaceUsesOfWith(OrigBB, NewBB2);

   // Update DominatorTree, LoopInfo, and LCCSA analysis information.
   HasLoopExit = false;
@@ -646,11 +574,6 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,
   }
 }

-/// FoldReturnIntoUncondBranch - This method duplicates the specified return
-/// instruction into a predecessor which ends in an unconditional branch. If
-/// the return instruction returns a value defined by a PHI, propagate the
-/// right value into the return. It returns the new return instruction in the
-/// predecessor.
 ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
                                              BasicBlock *Pred) {
   Instruction *UncondBranch = Pred->getTerminator();
@@ -689,31 +612,10 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
   return cast<ReturnInst>(NewRet);
 }

-/// SplitBlockAndInsertIfThen - Split the containing block at the
-/// specified instruction - everything before and including SplitBefore stays
-/// in the old basic block, and everything after SplitBefore is moved to a
-/// new block. The two blocks are connected by a conditional branch
-/// (with value of Cmp being the condition).
-/// Before:
-///   Head
-///   SplitBefore
-///   Tail
-/// After:
-///   Head
-///   if (Cond)
-///     ThenBlock
-///   SplitBefore
-///   Tail
-///
-/// If Unreachable is true, then ThenBlock ends with
-/// UnreachableInst, otherwise it branches to Tail.
-/// Returns the NewBasicBlock's terminator.
-
-TerminatorInst *llvm::SplitBlockAndInsertIfThen(Value *Cond,
-                                                Instruction *SplitBefore,
-                                                bool Unreachable,
-                                                MDNode *BranchWeights,
-                                                DominatorTree *DT) {
+TerminatorInst *
+llvm::SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore,
+                                bool Unreachable, MDNode *BranchWeights,
+                                DominatorTree *DT, LoopInfo *LI) {
   BasicBlock *Head = SplitBefore->getParent();
   BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator());
   TerminatorInst *HeadOldTerm = Head->getTerminator();
@@ -735,7 +637,7 @@ TerminatorInst *llvm::SplitBlockAndInsertIfThen(Value *Cond,
       std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());

       DomTreeNode *NewNode = DT->addNewBlock(Tail, Head);
-      for (auto Child : Children)
+      for (DomTreeNode *Child : Children)
         DT->changeImmediateDominator(Child, NewNode);

       // Head dominates ThenBlock.
@@ -743,23 +645,15 @@ TerminatorInst *llvm::SplitBlockAndInsertIfThen(Value *Cond,
     }
   }

+  if (LI) {
+    Loop *L = LI->getLoopFor(Head);
+    L->addBasicBlockToLoop(ThenBlock, *LI);
+    L->addBasicBlockToLoop(Tail, *LI);
+  }
+
   return CheckTerm;
 }

-/// SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen,
-/// but also creates the ElseBlock.
-/// Before:
-///   Head
-///   SplitBefore
-///   Tail
-/// After:
-///   Head
-///   if (Cond)
-///     ThenBlock
-///   else
-///     ElseBlock
-///   SplitBefore
-///   Tail
 void llvm::SplitBlockAndInsertIfThenElse(Value *Cond, Instruction *SplitBefore,
                                          TerminatorInst **ThenTerm,
                                          TerminatorInst **ElseTerm,
                                          MDNode *BranchWeights) {
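SplitBlockAndInsertIfThen now takes an optional LoopInfo and registers both new blocks with the enclosing loop. A hedged usage sketch against the updated signature (instrumentBefore is a hypothetical caller; note the hunk above does not null-check the loop, so this assumes Inst sits inside a loop whenever LI is non-null):

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

using namespace llvm;

// Insert "if (Cond) { ... }" before Inst, keeping LoopInfo up to date so a
// caller running under the loop pass manager does not invalidate it.
static void instrumentBefore(Instruction *Inst, Value *Cond, LoopInfo *LI) {
  TerminatorInst *ThenTerm = SplitBlockAndInsertIfThen(
      Cond, Inst, /*Unreachable=*/false, /*BranchWeights=*/nullptr,
      /*DT=*/nullptr, LI);
  IRBuilder<> IRB(ThenTerm);
  // ... emit the conditional code at IRB's insert point ...
  (void)IRB;
}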
@@ -781,15 +675,6 @@ void llvm::SplitBlockAndInsertIfThenElse(Value *Cond, Instruction *SplitBefore,
 }

-/// GetIfCondition - Given a basic block (BB) with two predecessors,
-/// check to see if the merge at this block is due
-/// to an "if condition". If so, return the boolean condition that determines
-/// which entry into BB will be taken. Also, return by references the block
-/// that will be entered from if the condition is true, and the block that will
-/// be entered if the condition is false.
-///
-/// This does no checking to see if the true/false blocks have large or unsavory
-/// instructions in them.
 Value *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
                             BasicBlock *&IfFalse) {
   PHINode *SomePHI = dyn_cast<PHINode>(BB->begin());
diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 95825991cee96..49b646a041f50 100644
--- a/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -76,11 +76,10 @@ FunctionPass *llvm::createBreakCriticalEdgesPass() {
 // Implementation of the external critical edge manipulation functions
 //===----------------------------------------------------------------------===//

-/// createPHIsForSplitLoopExit - When a loop exit edge is split, LCSSA form
-/// may require new PHIs in the new exit block. This function inserts the
-/// new PHIs, as needed. Preds is a list of preds inside the loop, SplitBB
-/// is the new loop exit block, and DestBB is the old loop exit, now the
-/// successor of SplitBB.
+/// When a loop exit edge is split, LCSSA form may require new PHIs in the new
+/// exit block. This function inserts the new PHIs, as needed. Preds is a list
+/// of preds inside the loop, SplitBB is the new loop exit block, and DestBB is
+/// the old loop exit, now the successor of SplitBB.
 static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds,
                                        BasicBlock *SplitBB,
                                        BasicBlock *DestBB) {
@@ -112,25 +111,9 @@ static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds,
   }
 }

-/// SplitCriticalEdge - If this edge is a critical edge, insert a new node to
-/// split the critical edge. This will update DominatorTree information if it
-/// is available, thus calling this pass will not invalidate either of them.
-/// This returns the new block if the edge was split, null otherwise.
-///
-/// If MergeIdenticalEdges is true (not the default), *all* edges from TI to the
-/// specified successor will be merged into the same critical edge block.
-/// This is most commonly interesting with switch instructions, which may
-/// have many edges to any one destination. This ensures that all edges to that
-/// dest go to one block instead of each going to a different block, but isn't
-/// the standard definition of a "critical edge".
-///
-/// It is invalid to call this function on a critical edge that starts at an
-/// IndirectBrInst. Splitting these edges will almost always create an invalid
-/// program because the address of the new block won't be the one that is jumped
-/// to.
-///
-BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
-                                    const CriticalEdgeSplittingOptions &Options) {
+BasicBlock *
+llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
+                        const CriticalEdgeSplittingOptions &Options) {
   if (!isCriticalEdge(TI, SuccNum, Options.MergeIdenticalEdges))
     return nullptr;

diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp
index 64b44a6b79194..f4260a9ff9804 100644
--- a/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -13,6 +13,7 @@

 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -25,81 +26,742 @@

 using namespace llvm;

-/// CastToCStr - Return V if it is an i8*, otherwise cast it to i8*.
-Value *llvm::CastToCStr(Value *V, IRBuilder<> &B) {
+#define DEBUG_TYPE "build-libcalls"
+
+//- Infer Attributes ---------------------------------------------------------//
+
+STATISTIC(NumReadNone, "Number of functions inferred as readnone");
+STATISTIC(NumReadOnly, "Number of functions inferred as readonly");
+STATISTIC(NumArgMemOnly, "Number of functions inferred as argmemonly");
+STATISTIC(NumNoUnwind, "Number of functions inferred as nounwind");
+STATISTIC(NumNoCapture, "Number of arguments inferred as nocapture");
+STATISTIC(NumReadOnlyArg, "Number of arguments inferred as readonly");
+STATISTIC(NumNoAlias, "Number of function returns inferred as noalias");
+STATISTIC(NumNonNull, "Number of function returns inferred as nonnull returns");
+
+static bool setDoesNotAccessMemory(Function &F) {
+  if (F.doesNotAccessMemory())
+    return false;
+  F.setDoesNotAccessMemory();
+  ++NumReadNone;
+  return true;
+}
+
+static bool setOnlyReadsMemory(Function &F) {
+  if (F.onlyReadsMemory())
+    return false;
+  F.setOnlyReadsMemory();
+  ++NumReadOnly;
+  return true;
+}
+
+static bool setOnlyAccessesArgMemory(Function &F) {
+  if (F.onlyAccessesArgMemory())
+    return false;
+  F.setOnlyAccessesArgMemory();
+  ++NumArgMemOnly;
+  return true;
+}
+
+static bool setDoesNotThrow(Function &F) {
+  if (F.doesNotThrow())
+    return false;
+  F.setDoesNotThrow();
+  ++NumNoUnwind;
+  return true;
+}
+
+static bool setDoesNotCapture(Function &F, unsigned n) {
+  if (F.doesNotCapture(n))
+    return false;
+  F.setDoesNotCapture(n);
+  ++NumNoCapture;
+  return true;
+}
+
+static bool setOnlyReadsMemory(Function &F, unsigned n) {
+  if (F.onlyReadsMemory(n))
+    return false;
+  F.setOnlyReadsMemory(n);
+  ++NumReadOnlyArg;
+  return true;
+}
+
+static bool setDoesNotAlias(Function &F, unsigned n) {
+  if (F.doesNotAlias(n))
+    return false;
+  F.setDoesNotAlias(n);
+  ++NumNoAlias;
+  return true;
+}
+
+static bool setNonNull(Function &F, unsigned n) {
+  assert((n != AttributeSet::ReturnIndex || F.getReturnType()->isPointerTy()) &&
+         "nonnull applies only to pointers");
+  if (F.getAttributes().hasAttribute(n, Attribute::NonNull))
+    return false;
+  F.addAttribute(n, Attribute::NonNull);
+  ++NumNonNull;
+  return true;
+}
+
+bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
+  LibFunc::Func TheLibFunc;
+  if (!(TLI.getLibFunc(F, TheLibFunc) && TLI.has(TheLibFunc)))
+    return false;
+
+  bool Changed = false;
+  switch (TheLibFunc) {
+  case LibFunc::strlen:
+    Changed |= setOnlyReadsMemory(F);
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    return Changed;
+  case LibFunc::strchr:
+  case LibFunc::strrchr:
+    Changed |= setOnlyReadsMemory(F);
+    Changed |= setDoesNotThrow(F);
+    return Changed;
+  case LibFunc::strtol:
+  case LibFunc::strtod:
+  case LibFunc::strtof:
+  case LibFunc::strtoul:
+  case LibFunc::strtoll:
+  case LibFunc::strtold:
+  case LibFunc::strtoull:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::strcpy:
+  case LibFunc::stpcpy:
+  case LibFunc::strcat:
+  case LibFunc::strncat:
+  case LibFunc::strncpy:
+  case LibFunc::stpncpy:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
+  case LibFunc::strxfrm:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
+  case LibFunc::strcmp:      // 0,1
+  case LibFunc::strspn:      // 0,1
+  case LibFunc::strncmp:     // 0,1
+  case LibFunc::strcspn:     // 0,1
+  case LibFunc::strcoll:     // 0,1
+  case LibFunc::strcasecmp:  // 0,1
+  case LibFunc::strncasecmp: //
+    Changed |= setOnlyReadsMemory(F);
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    return Changed;
+  case LibFunc::strstr:
+  case LibFunc::strpbrk:
+    Changed |= setOnlyReadsMemory(F);
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 2);
+    return Changed;
+  case LibFunc::strtok:
+  case LibFunc::strtok_r:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
+  case LibFunc::scanf:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::setbuf:
+  case LibFunc::setvbuf:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    return Changed;
+  case LibFunc::strdup:
+  case LibFunc::strndup:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotAlias(F, 0);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::stat:
+  case LibFunc::statvfs:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::sscanf:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 1);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
+  case LibFunc::sprintf:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
+  case LibFunc::snprintf:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 3);
+    Changed |= setOnlyReadsMemory(F, 3);
+    return Changed;
+  case LibFunc::setitimer:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setDoesNotCapture(F, 3);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
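Every setter above follows the same idempotent shape: return false if the attribute is already present, otherwise set it, bump the matching STATISTIC counter, and return true so inferLibFuncAttributes can accumulate an accurate Changed flag. A library-free C++ sketch of that pattern (Function, the Attr enum, and the counter here are stand-ins, not LLVM types):

#include <bitset>
#include <cstdio>

enum Attr { NoUnwind = 0, ReadOnly = 1, MaxAttr = 2 };

struct Function {
  std::bitset<MaxAttr> Attrs;
};

static unsigned NumNoUnwind = 0;

static bool setDoesNotThrow(Function &F) {
  if (F.Attrs.test(NoUnwind))
    return false; // already inferred: report "no change"
  F.Attrs.set(NoUnwind);
  ++NumNoUnwind; // mirrors the STATISTIC counters above
  return true;
}

int main() {
  Function F;
  bool Changed = setDoesNotThrow(F); // true: attribute newly added
  Changed |= setDoesNotThrow(F);     // second call is a no-op
  std::printf("%d %u\n", Changed, NumNoUnwind); // prints "1 1"
}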
+  case LibFunc::system:
+    // May throw; "system" is a valid pthread cancellation point.
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::malloc:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotAlias(F, 0);
+    return Changed;
+  case LibFunc::memcmp:
+    Changed |= setOnlyReadsMemory(F);
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    return Changed;
+  case LibFunc::memchr:
+  case LibFunc::memrchr:
+    Changed |= setOnlyReadsMemory(F);
+    Changed |= setDoesNotThrow(F);
+    return Changed;
+  case LibFunc::modf:
+  case LibFunc::modff:
+  case LibFunc::modfl:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 2);
+    return Changed;
+  case LibFunc::memcpy:
+  case LibFunc::memccpy:
+  case LibFunc::memmove:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
+  case LibFunc::memcpy_chk:
+    Changed |= setDoesNotThrow(F);
+    return Changed;
+  case LibFunc::memalign:
+    Changed |= setDoesNotAlias(F, 0);
+    return Changed;
+  case LibFunc::mkdir:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::mktime:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    return Changed;
+  case LibFunc::realloc:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotAlias(F, 0);
+    Changed |= setDoesNotCapture(F, 1);
+    return Changed;
+  case LibFunc::read:
+    // May throw; "read" is a valid pthread cancellation point.
+    Changed |= setDoesNotCapture(F, 2);
+    return Changed;
+  case LibFunc::rewind:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    return Changed;
+  case LibFunc::rmdir:
+  case LibFunc::remove:
+  case LibFunc::realpath:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::rename:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 1);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
+  case LibFunc::readlink:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::write:
+    // May throw; "write" is a valid pthread cancellation point.
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
+  case LibFunc::bcopy:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::bcmp:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setOnlyReadsMemory(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    return Changed;
+  case LibFunc::bzero:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    return Changed;
+  case LibFunc::calloc:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotAlias(F, 0);
+    return Changed;
+  case LibFunc::chmod:
+  case LibFunc::chown:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::ctermid:
+  case LibFunc::clearerr:
+  case LibFunc::closedir:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    return Changed;
+  case LibFunc::atoi:
+  case LibFunc::atol:
+  case LibFunc::atof:
+  case LibFunc::atoll:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setOnlyReadsMemory(F);
+    Changed |= setDoesNotCapture(F, 1);
+    return Changed;
+  case LibFunc::access:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::fopen:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotAlias(F, 0);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 1);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
+  case LibFunc::fdopen:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotAlias(F, 0);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
+  case LibFunc::feof:
+  case LibFunc::free:
+  case LibFunc::fseek:
+  case LibFunc::ftell:
+  case LibFunc::fgetc:
+  case LibFunc::fseeko:
+  case LibFunc::ftello:
+  case LibFunc::fileno:
+  case LibFunc::fflush:
+  case LibFunc::fclose:
+  case LibFunc::fsetpos:
+  case LibFunc::flockfile:
+  case LibFunc::funlockfile:
+  case LibFunc::ftrylockfile:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    return Changed;
+  case LibFunc::ferror:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F);
+    return Changed;
+  case LibFunc::fputc:
+  case LibFunc::fstat:
+  case LibFunc::frexp:
+  case LibFunc::frexpf:
+  case LibFunc::frexpl:
+  case LibFunc::fstatvfs:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 2);
+    return Changed;
+  case LibFunc::fgets:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 3);
+    return Changed;
+  case LibFunc::fread:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 4);
+    return Changed;
+  case LibFunc::fwrite:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 4);
+    // FIXME: readonly #1?
+    return Changed;
+  case LibFunc::fputs:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::fscanf:
+  case LibFunc::fprintf:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
+  case LibFunc::fgetpos:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    return Changed;
+  case LibFunc::getc:
+  case LibFunc::getlogin_r:
+  case LibFunc::getc_unlocked:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    return Changed;
+  case LibFunc::getenv:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setOnlyReadsMemory(F);
+    Changed |= setDoesNotCapture(F, 1);
+    return Changed;
+  case LibFunc::gets:
+  case LibFunc::getchar:
+    Changed |= setDoesNotThrow(F);
+    return Changed;
+  case LibFunc::getitimer:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 2);
+    return Changed;
+  case LibFunc::getpwnam:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::ungetc:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 2);
+    return Changed;
+  case LibFunc::uname:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    return Changed;
+  case LibFunc::unlink:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::unsetenv:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::utime:
+  case LibFunc::utimes:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 1);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
+  case LibFunc::putc:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 2);
+    return Changed;
+  case LibFunc::puts:
+  case LibFunc::printf:
+  case LibFunc::perror:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::pread:
+    // May throw; "pread" is a valid pthread cancellation point.
+    Changed |= setDoesNotCapture(F, 2);
+    return Changed;
+  case LibFunc::pwrite:
+    // May throw; "pwrite" is a valid pthread cancellation point.
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
+  case LibFunc::putchar:
+    Changed |= setDoesNotThrow(F);
+    return Changed;
+  case LibFunc::popen:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotAlias(F, 0);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 1);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
+  case LibFunc::pclose:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    return Changed;
+  case LibFunc::vscanf:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::vsscanf:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 1);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
+  case LibFunc::vfscanf:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
+  case LibFunc::valloc:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotAlias(F, 0);
+    return Changed;
+  case LibFunc::vprintf:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::vfprintf:
+  case LibFunc::vsprintf:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
+  case LibFunc::vsnprintf:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 3);
+    Changed |= setOnlyReadsMemory(F, 3);
+    return Changed;
+  case LibFunc::open:
+    // May throw; "open" is a valid pthread cancellation point.
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::opendir:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotAlias(F, 0);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::tmpfile:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotAlias(F, 0);
+    return Changed;
+  case LibFunc::times:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    return Changed;
+  case LibFunc::htonl:
+  case LibFunc::htons:
+  case LibFunc::ntohl:
+  case LibFunc::ntohs:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotAccessMemory(F);
+    return Changed;
+  case LibFunc::lstat:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::lchown:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::qsort:
+    // May throw; places call through function pointer.
+    Changed |= setDoesNotCapture(F, 4);
+    return Changed;
+  case LibFunc::dunder_strdup:
+  case LibFunc::dunder_strndup:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotAlias(F, 0);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::dunder_strtok_r:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
+  case LibFunc::under_IO_getc:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    return Changed;
+  case LibFunc::under_IO_putc:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 2);
+    return Changed;
+  case LibFunc::dunder_isoc99_scanf:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::stat64:
+  case LibFunc::lstat64:
+  case LibFunc::statvfs64:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::dunder_isoc99_sscanf:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 1);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
+  case LibFunc::fopen64:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotAlias(F, 0);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 1);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
+  case LibFunc::fseeko64:
+  case LibFunc::ftello64:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    return Changed;
+  case LibFunc::tmpfile64:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotAlias(F, 0);
+    return Changed;
+  case LibFunc::fstat64:
+  case LibFunc::fstatvfs64:
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 2);
+    return Changed;
+  case LibFunc::open64:
+    // May throw; "open" is a valid pthread cancellation point.
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setOnlyReadsMemory(F, 1);
+    return Changed;
+  case LibFunc::gettimeofday:
+    // Currently some platforms have the restrict keyword on the arguments to
+    // gettimeofday. To be conservative, do not add noalias to gettimeofday's
+    // arguments.
+    Changed |= setDoesNotThrow(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    return Changed;
+  case LibFunc::Znwj: // new(unsigned int)
+  case LibFunc::Znwm: // new(unsigned long)
+  case LibFunc::Znaj: // new[](unsigned int)
+  case LibFunc::Znam: // new[](unsigned long)
+  case LibFunc::msvc_new_int: // new(unsigned int)
+  case LibFunc::msvc_new_longlong: // new(unsigned long long)
+  case LibFunc::msvc_new_array_int: // new[](unsigned int)
+  case LibFunc::msvc_new_array_longlong: // new[](unsigned long long)
+    // Operator new always returns a nonnull noalias pointer
+    Changed |= setNonNull(F, AttributeSet::ReturnIndex);
+    Changed |= setDoesNotAlias(F, AttributeSet::ReturnIndex);
+    return Changed;
+  //TODO: add LibFunc entries for:
+  //case LibFunc::memset_pattern4:
+  //case LibFunc::memset_pattern8:
+  case LibFunc::memset_pattern16:
+    Changed |= setOnlyAccessesArgMemory(F);
+    Changed |= setDoesNotCapture(F, 1);
+    Changed |= setDoesNotCapture(F, 2);
+    Changed |= setOnlyReadsMemory(F, 2);
+    return Changed;
+  // int __nvvm_reflect(const char *)
+  case LibFunc::nvvm_reflect:
+    Changed |= setDoesNotAccessMemory(F);
+    Changed |= setDoesNotThrow(F);
+    return Changed;
+
+  default:
+    // FIXME: It'd be really nice to cover all the library functions we're
+    // aware of here.
+    return false;
+  }
+}
+
+//- Emit LibCalls ------------------------------------------------------------//
+
+Value *llvm::castToCStr(Value *V, IRBuilder<> &B) {
   unsigned AS = V->getType()->getPointerAddressSpace();
   return B.CreateBitCast(V, B.getInt8PtrTy(AS), "cstr");
 }

-/// EmitStrLen - Emit a call to the strlen function to the builder, for the
-/// specified pointer. This always returns an integer value of size intptr_t.
-Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL,
+Value *llvm::emitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL,
                         const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::strlen))
     return nullptr;

-  Module *M = B.GetInsertBlock()->getParent()->getParent();
-  AttributeSet AS[2];
-  AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
-  Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
-  AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs);
-
+  Module *M = B.GetInsertBlock()->getModule();
   LLVMContext &Context = B.GetInsertBlock()->getContext();
-  Constant *StrLen = M->getOrInsertFunction(
-      "strlen", AttributeSet::get(M->getContext(), AS),
-      DL.getIntPtrType(Context), B.getInt8PtrTy(), nullptr);
-  CallInst *CI = B.CreateCall(StrLen, CastToCStr(Ptr, B), "strlen");
+  Constant *StrLen = M->getOrInsertFunction("strlen", DL.getIntPtrType(Context),
+                                            B.getInt8PtrTy(), nullptr);
+  inferLibFuncAttributes(*M->getFunction("strlen"), *TLI);
+  CallInst *CI = B.CreateCall(StrLen, castToCStr(Ptr, B), "strlen");
   if (const Function *F = dyn_cast<Function>(StrLen->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());

   return CI;
 }
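With the attribute logic centralized, each emitter shrinks to the same recipe: declare the function with getOrInsertFunction, run inferLibFuncAttributes over the declaration, and build the call. A hedged usage sketch of the renamed emitStrLen (replaceWithStrlen is a hypothetical caller, not part of the patch):

#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"

using namespace llvm;

// Emit "strlen(Str)" at B's insert point; returns an intptr_t-sized length,
// or nullptr when the target library has no strlen.
static Value *replaceWithStrlen(Value *Str, IRBuilder<> &B,
                                const TargetLibraryInfo *TLI) {
  const DataLayout &DL = B.GetInsertBlock()->getModule()->getDataLayout();
  return emitStrLen(Str, B, DL, TLI);
}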
-/// EmitStrChr - Emit a call to the strchr function to the builder, for the
-/// specified pointer and character. Ptr is required to be some pointer type,
-/// and the return value has 'i8*' type.
-Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B,
+Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilder<> &B,
                         const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::strchr))
     return nullptr;

-  Module *M = B.GetInsertBlock()->getParent()->getParent();
-  Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
-  AttributeSet AS =
-      AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs);
-
+  Module *M = B.GetInsertBlock()->getModule();
   Type *I8Ptr = B.getInt8PtrTy();
   Type *I32Ty = B.getInt32Ty();
-  Constant *StrChr = M->getOrInsertFunction("strchr",
-                                            AttributeSet::get(M->getContext(),
-                                                              AS),
-                                            I8Ptr, I8Ptr, I32Ty, nullptr);
+  Constant *StrChr =
+      M->getOrInsertFunction("strchr", I8Ptr, I8Ptr, I32Ty, nullptr);
+  inferLibFuncAttributes(*M->getFunction("strchr"), *TLI);
   CallInst *CI = B.CreateCall(
-      StrChr, {CastToCStr(Ptr, B), ConstantInt::get(I32Ty, C)}, "strchr");
+      StrChr, {castToCStr(Ptr, B), ConstantInt::get(I32Ty, C)}, "strchr");
   if (const Function *F = dyn_cast<Function>(StrChr->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
   return CI;
 }

-/// EmitStrNCmp - Emit a call to the strncmp function.
-Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
+Value *llvm::emitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
                          const DataLayout &DL, const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::strncmp))
     return nullptr;

-  Module *M = B.GetInsertBlock()->getParent()->getParent();
-  AttributeSet AS[3];
-  AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
-  AS[1] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
-  Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
-  AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs);
-
+  Module *M = B.GetInsertBlock()->getModule();
   LLVMContext &Context = B.GetInsertBlock()->getContext();
-  Value *StrNCmp = M->getOrInsertFunction(
-      "strncmp", AttributeSet::get(M->getContext(), AS), B.getInt32Ty(),
-      B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context), nullptr);
+  Value *StrNCmp = M->getOrInsertFunction("strncmp", B.getInt32Ty(),
+                                          B.getInt8PtrTy(), B.getInt8PtrTy(),
+                                          DL.getIntPtrType(Context), nullptr);
+  inferLibFuncAttributes(*M->getFunction("strncmp"), *TLI);
   CallInst *CI = B.CreateCall(
-      StrNCmp, {CastToCStr(Ptr1, B), CastToCStr(Ptr2, B), Len}, "strncmp");
+      StrNCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, "strncmp");

   if (const Function *F = dyn_cast<Function>(StrNCmp->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
@@ -107,64 +769,46 @@ Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,

   return CI;
 }

-/// EmitStrCpy - Emit a call to the strcpy function to the builder, for the
-/// specified pointer arguments.
-Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
+Value *llvm::emitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
                         const TargetLibraryInfo *TLI, StringRef Name) {
   if (!TLI->has(LibFunc::strcpy))
     return nullptr;

-  Module *M = B.GetInsertBlock()->getParent()->getParent();
-  AttributeSet AS[2];
-  AS[0] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
-  AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
-                            Attribute::NoUnwind);
+  Module *M = B.GetInsertBlock()->getModule();
   Type *I8Ptr = B.getInt8PtrTy();
-  Value *StrCpy = M->getOrInsertFunction(Name,
-                                         AttributeSet::get(M->getContext(), AS),
-                                         I8Ptr, I8Ptr, I8Ptr, nullptr);
+  Value *StrCpy = M->getOrInsertFunction(Name, I8Ptr, I8Ptr, I8Ptr, nullptr);
+  inferLibFuncAttributes(*M->getFunction(Name), *TLI);
   CallInst *CI =
-      B.CreateCall(StrCpy, {CastToCStr(Dst, B), CastToCStr(Src, B)}, Name);
+      B.CreateCall(StrCpy, {castToCStr(Dst, B), castToCStr(Src, B)}, Name);
   if (const Function *F = dyn_cast<Function>(StrCpy->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
   return CI;
 }

-/// EmitStrNCpy - Emit a call to the strncpy function to the builder, for the
-/// specified pointer arguments.
-Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B,
+Value *llvm::emitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B,
                          const TargetLibraryInfo *TLI, StringRef Name) {
   if (!TLI->has(LibFunc::strncpy))
     return nullptr;

-  Module *M = B.GetInsertBlock()->getParent()->getParent();
-  AttributeSet AS[2];
-  AS[0] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
-  AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
-                            Attribute::NoUnwind);
+  Module *M = B.GetInsertBlock()->getModule();
   Type *I8Ptr = B.getInt8PtrTy();
-  Value *StrNCpy = M->getOrInsertFunction(Name,
-                                          AttributeSet::get(M->getContext(),
-                                                            AS),
-                                          I8Ptr, I8Ptr, I8Ptr,
+  Value *StrNCpy = M->getOrInsertFunction(Name, I8Ptr, I8Ptr, I8Ptr,
                                           Len->getType(), nullptr);
+  inferLibFuncAttributes(*M->getFunction(Name), *TLI);
   CallInst *CI = B.CreateCall(
-      StrNCpy, {CastToCStr(Dst, B), CastToCStr(Src, B), Len}, "strncpy");
+      StrNCpy, {castToCStr(Dst, B), castToCStr(Src, B), Len}, "strncpy");
   if (const Function *F = dyn_cast<Function>(StrNCpy->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
   return CI;
 }

-/// EmitMemCpyChk - Emit a call to the __memcpy_chk function to the builder.
-/// This expects that the Len and ObjSize have type 'intptr_t' and Dst/Src
-/// are pointers.
-Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
+Value *llvm::emitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
                            IRBuilder<> &B, const DataLayout &DL,
                            const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::memcpy_chk))
     return nullptr;

-  Module *M = B.GetInsertBlock()->getParent()->getParent();
+  Module *M = B.GetInsertBlock()->getModule();
   AttributeSet AS;
   AS = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
                          Attribute::NoUnwind);
@@ -173,30 +817,26 @@ Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
       "__memcpy_chk", AttributeSet::get(M->getContext(), AS), B.getInt8PtrTy(),
       B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context),
       DL.getIntPtrType(Context), nullptr);
-  Dst = CastToCStr(Dst, B);
-  Src = CastToCStr(Src, B);
+  Dst = castToCStr(Dst, B);
+  Src = castToCStr(Src, B);
   CallInst *CI = B.CreateCall(MemCpy, {Dst, Src, Len, ObjSize});
   if (const Function *F = dyn_cast<Function>(MemCpy->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
   return CI;
 }

-/// EmitMemChr - Emit a call to the memchr function. This assumes that Ptr is
-/// a pointer, Val is an i32 value, and Len is an 'intptr_t' value.
-Value *llvm::EmitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B,
+Value *llvm::emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B,
                         const DataLayout &DL, const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::memchr))
     return nullptr;

-  Module *M = B.GetInsertBlock()->getParent()->getParent();
-  AttributeSet AS;
-  Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
-  AS = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs);
+  Module *M = B.GetInsertBlock()->getModule();
   LLVMContext &Context = B.GetInsertBlock()->getContext();
-  Value *MemChr = M->getOrInsertFunction(
-      "memchr", AttributeSet::get(M->getContext(), AS), B.getInt8PtrTy(),
-      B.getInt8PtrTy(), B.getInt32Ty(), DL.getIntPtrType(Context), nullptr);
-  CallInst *CI = B.CreateCall(MemChr, {CastToCStr(Ptr, B), Val, Len}, "memchr");
+  Value *MemChr = M->getOrInsertFunction("memchr", B.getInt8PtrTy(),
+                                         B.getInt8PtrTy(), B.getInt32Ty(),
+                                         DL.getIntPtrType(Context), nullptr);
+  inferLibFuncAttributes(*M->getFunction("memchr"), *TLI);
+  CallInst *CI = B.CreateCall(MemChr, {castToCStr(Ptr, B), Val, Len}, "memchr");

   if (const Function *F = dyn_cast<Function>(MemChr->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
@@ -204,25 +844,19 @@ Value *llvm::EmitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B,

   return CI;
 }

-/// EmitMemCmp - Emit a call to the memcmp function.
-Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
+Value *llvm::emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
                         const DataLayout &DL, const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::memcmp))
     return nullptr;

-  Module *M = B.GetInsertBlock()->getParent()->getParent();
-  AttributeSet AS[3];
-  AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
-  AS[1] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
-  Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
-  AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs);
-
+  Module *M = B.GetInsertBlock()->getModule();
   LLVMContext &Context = B.GetInsertBlock()->getContext();
-  Value *MemCmp = M->getOrInsertFunction(
-      "memcmp", AttributeSet::get(M->getContext(), AS), B.getInt32Ty(),
-      B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context), nullptr);
+  Value *MemCmp = M->getOrInsertFunction("memcmp", B.getInt32Ty(),
+                                         B.getInt8PtrTy(), B.getInt8PtrTy(),
+                                         DL.getIntPtrType(Context), nullptr);
+  inferLibFuncAttributes(*M->getFunction("memcmp"), *TLI);
   CallInst *CI = B.CreateCall(
-      MemCmp, {CastToCStr(Ptr1, B), CastToCStr(Ptr2, B), Len}, "memcmp");
+      MemCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, "memcmp");

   if (const Function *F = dyn_cast<Function>(MemCmp->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
@@ -231,7 +865,8 @@ Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
 }

 /// Append a suffix to the function name according to the type of 'Op'.
-static void AppendTypeSuffix(Value *Op, StringRef &Name, SmallString<20> &NameBuffer) {
+static void appendTypeSuffix(Value *Op, StringRef &Name,
+                             SmallString<20> &NameBuffer) {
   if (!Op->getType()->isDoubleTy()) {
     NameBuffer += Name;

@@ -242,19 +877,14 @@ static void AppendTypeSuffix(Value *Op, StringRef &Name, SmallString<20> &NameBu
     Name = NameBuffer;
   }
-  return;
 }

-/// EmitUnaryFloatFnCall - Emit a call to the unary function named 'Name' (e.g.
-/// 'floor'). This function is known to take a single of type matching 'Op' and
-/// returns one value with the same type. If 'Op' is a long double, 'l' is
-/// added as the suffix of name, if 'Op' is a float, we add a 'f' suffix.
-Value *llvm::EmitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
+Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
                                   const AttributeSet &Attrs) {
   SmallString<20> NameBuffer;
-  AppendTypeSuffix(Op, Name, NameBuffer);
+  appendTypeSuffix(Op, Name, NameBuffer);

-  Module *M = B.GetInsertBlock()->getParent()->getParent();
+  Module *M = B.GetInsertBlock()->getModule();
   Value *Callee = M->getOrInsertFunction(Name, Op->getType(),
                                          Op->getType(), nullptr);
   CallInst *CI = B.CreateCall(Callee, Op, Name);
@@ -265,19 +895,14 @@ Value *llvm::EmitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
   return CI;
 }

-/// EmitBinaryFloatFnCall - Emit a call to the binary function named 'Name'
-/// (e.g. 'fmin'). This function is known to take type matching 'Op1' and 'Op2'
-/// and return one value with the same type. If 'Op1/Op2' are long double, 'l'
-/// is added as the suffix of name, if 'Op1/Op2' is a float, we add a 'f'
-/// suffix.
-Value *llvm::EmitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name, +Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name, IRBuilder<> &B, const AttributeSet &Attrs) { SmallString<20> NameBuffer; - AppendTypeSuffix(Op1, Name, NameBuffer); + appendTypeSuffix(Op1, Name, NameBuffer); - Module *M = B.GetInsertBlock()->getParent()->getParent(); - Value *Callee = M->getOrInsertFunction(Name, Op1->getType(), - Op1->getType(), Op2->getType(), nullptr); + Module *M = B.GetInsertBlock()->getModule(); + Value *Callee = M->getOrInsertFunction(Name, Op1->getType(), Op1->getType(), + Op2->getType(), nullptr); CallInst *CI = B.CreateCall(Callee, {Op1, Op2}, Name); CI->setAttributes(Attrs); if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts())) @@ -286,14 +911,12 @@ Value *llvm::EmitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name, return CI; } -/// EmitPutChar - Emit a call to the putchar function. This assumes that Char -/// is an integer. -Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, +Value *llvm::emitPutChar(Value *Char, IRBuilder<> &B, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::putchar)) return nullptr; - Module *M = B.GetInsertBlock()->getParent()->getParent(); + Module *M = B.GetInsertBlock()->getModule(); Value *PutChar = M->getOrInsertFunction("putchar", B.getInt32Ty(), B.getInt32Ty(), nullptr); CallInst *CI = B.CreateCall(PutChar, @@ -308,54 +931,31 @@ Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, return CI; } -/// EmitPutS - Emit a call to the puts function. This assumes that Str is -/// some pointer. -Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, +Value *llvm::emitPutS(Value *Str, IRBuilder<> &B, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::puts)) return nullptr; - Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeSet AS[2]; - AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture); - AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); - - Value *PutS = M->getOrInsertFunction("puts", - AttributeSet::get(M->getContext(), AS), - B.getInt32Ty(), - B.getInt8PtrTy(), - nullptr); - CallInst *CI = B.CreateCall(PutS, CastToCStr(Str, B), "puts"); + Module *M = B.GetInsertBlock()->getModule(); + Value *PutS = + M->getOrInsertFunction("puts", B.getInt32Ty(), B.getInt8PtrTy(), nullptr); + inferLibFuncAttributes(*M->getFunction("puts"), *TLI); + CallInst *CI = B.CreateCall(PutS, castToCStr(Str, B), "puts"); if (const Function *F = dyn_cast<Function>(PutS->stripPointerCasts())) CI->setCallingConv(F->getCallingConv()); return CI; } -/// EmitFPutC - Emit a call to the fputc function. This assumes that Char is -/// an integer and File is a pointer to FILE. 
-Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B, +Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilder<> &B, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::fputc)) return nullptr; - Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeSet AS[2]; - AS[0] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture); - AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); - Constant *F; + Module *M = B.GetInsertBlock()->getModule(); + Constant *F = M->getOrInsertFunction("fputc", B.getInt32Ty(), B.getInt32Ty(), + File->getType(), nullptr); if (File->getType()->isPointerTy()) - F = M->getOrInsertFunction("fputc", - AttributeSet::get(M->getContext(), AS), - B.getInt32Ty(), - B.getInt32Ty(), File->getType(), - nullptr); - else - F = M->getOrInsertFunction("fputc", - B.getInt32Ty(), - B.getInt32Ty(), - File->getType(), nullptr); + inferLibFuncAttributes(*M->getFunction("fputc"), *TLI); Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/true, "chari"); CallInst *CI = B.CreateCall(F, {Char, File}, "fputc"); @@ -365,66 +965,40 @@ Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B, return CI; } -/// EmitFPutS - Emit a call to the puts function. Str is required to be a -/// pointer and File is a pointer to FILE. -Value *llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, +Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilder<> &B, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::fputs)) return nullptr; - Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeSet AS[3]; - AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture); - AS[1] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture); - AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); + Module *M = B.GetInsertBlock()->getModule(); StringRef FPutsName = TLI->getName(LibFunc::fputs); - Constant *F; + Constant *F = M->getOrInsertFunction( + FPutsName, B.getInt32Ty(), B.getInt8PtrTy(), File->getType(), nullptr); if (File->getType()->isPointerTy()) - F = M->getOrInsertFunction(FPutsName, - AttributeSet::get(M->getContext(), AS), - B.getInt32Ty(), - B.getInt8PtrTy(), - File->getType(), nullptr); - else - F = M->getOrInsertFunction(FPutsName, B.getInt32Ty(), - B.getInt8PtrTy(), - File->getType(), nullptr); - CallInst *CI = B.CreateCall(F, {CastToCStr(Str, B), File}, "fputs"); + inferLibFuncAttributes(*M->getFunction(FPutsName), *TLI); + CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, "fputs"); if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts())) CI->setCallingConv(Fn->getCallingConv()); return CI; } -/// EmitFWrite - Emit a call to the fwrite function. This assumes that Ptr is -/// a pointer, Size is an 'intptr_t', and File is a pointer to FILE. 
-Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B, +Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B, const DataLayout &DL, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::fwrite)) return nullptr; - Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeSet AS[3]; - AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture); - AS[1] = AttributeSet::get(M->getContext(), 4, Attribute::NoCapture); - AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); + Module *M = B.GetInsertBlock()->getModule(); LLVMContext &Context = B.GetInsertBlock()->getContext(); StringRef FWriteName = TLI->getName(LibFunc::fwrite); - Constant *F; + Constant *F = M->getOrInsertFunction( + FWriteName, DL.getIntPtrType(Context), B.getInt8PtrTy(), + DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType(), + nullptr); if (File->getType()->isPointerTy()) - F = M->getOrInsertFunction( - FWriteName, AttributeSet::get(M->getContext(), AS), - DL.getIntPtrType(Context), B.getInt8PtrTy(), DL.getIntPtrType(Context), - DL.getIntPtrType(Context), File->getType(), nullptr); - else - F = M->getOrInsertFunction(FWriteName, DL.getIntPtrType(Context), - B.getInt8PtrTy(), DL.getIntPtrType(Context), - DL.getIntPtrType(Context), File->getType(), - nullptr); + inferLibFuncAttributes(*M->getFunction(FWriteName), *TLI); CallInst *CI = - B.CreateCall(F, {CastToCStr(Ptr, B), Size, + B.CreateCall(F, {castToCStr(Ptr, B), Size, ConstantInt::get(DL.getIntPtrType(Context), 1), File}); if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts())) diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt index 8308a9b69149d..5aec0dce34db8 100644 --- a/lib/Transforms/Utils/CMakeLists.txt +++ b/lib/Transforms/Utils/CMakeLists.txt @@ -11,7 +11,9 @@ add_llvm_library(LLVMTransformUtils CodeExtractor.cpp CtorUtils.cpp DemoteRegToStack.cpp + Evaluator.cpp FlattenCFG.cpp + FunctionImportUtils.cpp GlobalStatus.cpp InlineFunction.cpp InstructionNamer.cpp @@ -26,10 +28,13 @@ add_llvm_library(LLVMTransformUtils LowerInvoke.cpp LowerSwitch.cpp Mem2Reg.cpp + MemorySSA.cpp MetaRenamer.cpp ModuleUtils.cpp + NameAnonFunctions.cpp PromoteMemoryToRegister.cpp SSAUpdater.cpp + SanitizerStats.cpp SimplifyCFG.cpp SimplifyIndVar.cpp SimplifyInstructions.cpp diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index 6454afb8bc42d..c5ca56360fc88 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -119,6 +119,15 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, .addAttributes(NewFunc->getContext(), AttributeSet::FunctionIndex, OldAttrs.getFnAttributes())); + SmallVector<std::pair<unsigned, MDNode *>, 1> MDs; + OldFunc->getAllMetadata(MDs); + for (auto MD : MDs) + NewFunc->addMetadata( + MD.first, + *MapMetadata(MD.second, VMap, + ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, + TypeMapper, Materializer)); + // Loop over all of the basic blocks in the function, cloning them as // appropriate. Note that we save BE this way in order to handle cloning of // recursive functions into themselves. @@ -163,65 +172,14 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, TypeMapper, Materializer); } -// Find the MDNode which corresponds to the subprogram data that described F. 
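The getAllMetadata/addMetadata loop added to CloneFunctionInto above subsumes the hand-rolled subprogram bookkeeping being deleted below: every function-level attachment is remapped through the VMap and re-attached to the clone. A hedged sketch of the same copy for the degenerate case where no remapping is needed (copyFunctionMetadata is an illustrative name, not part of this patch):

static void copyFunctionMetadata(const Function *OldF, Function *NewF) {
  SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
  OldF->getAllMetadata(MDs);
  for (const auto &MD : MDs)
    // Verbatim copy; the real loop pipes MD.second through MapMetadata so
    // references into the old function are rewritten to the clone.
    NewF->addMetadata(MD.first, *MD.second);
}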
-static DISubprogram *FindSubprogram(const Function *F, - DebugInfoFinder &Finder) { - for (DISubprogram *Subprogram : Finder.subprograms()) { - if (Subprogram->describes(F)) - return Subprogram; - } - return nullptr; -} - -// Add an operand to an existing MDNode. The new operand will be added at the -// back of the operand list. -static void AddOperand(DICompileUnit *CU, DISubprogramArray SPs, - Metadata *NewSP) { - SmallVector<Metadata *, 16> NewSPs; - NewSPs.reserve(SPs.size() + 1); - for (auto *SP : SPs) - NewSPs.push_back(SP); - NewSPs.push_back(NewSP); - CU->replaceSubprograms(MDTuple::get(CU->getContext(), NewSPs)); -} - -// Clone the module-level debug info associated with OldFunc. The cloned data -// will point to NewFunc instead. -static void CloneDebugInfoMetadata(Function *NewFunc, const Function *OldFunc, - ValueToValueMapTy &VMap) { - DebugInfoFinder Finder; - Finder.processModule(*OldFunc->getParent()); - - const DISubprogram *OldSubprogramMDNode = FindSubprogram(OldFunc, Finder); - if (!OldSubprogramMDNode) return; - - auto *NewSubprogram = - cast<DISubprogram>(MapMetadata(OldSubprogramMDNode, VMap)); - NewFunc->setSubprogram(NewSubprogram); - - for (auto *CU : Finder.compile_units()) { - auto Subprograms = CU->getSubprograms(); - // If the compile unit's function list contains the old function, it should - // also contain the new one. - for (auto *SP : Subprograms) { - if (SP == OldSubprogramMDNode) { - AddOperand(CU, Subprograms, NewSubprogram); - break; - } - } - } -} - -/// Return a copy of the specified function, but without -/// embedding the function into another module. Also, any references specified -/// in the VMap are changed to refer to their mapped value instead of the -/// original one. If any of the arguments to the function are in the VMap, -/// the arguments are deleted from the resultant function. The VMap is -/// updated to include mappings from all of the instructions and basicblocks in -/// the function from their old to new values. +/// Return a copy of the specified function and add it to that function's +/// module. Also, any references specified in the VMap are changed to refer to +/// their mapped value instead of the original one. If any of the arguments to +/// the function are in the VMap, the arguments are deleted from the resultant +/// function. The VMap is updated to include mappings from all of the +/// instructions and basicblocks in the function from their old to new values. /// -Function *llvm::CloneFunction(const Function *F, ValueToValueMapTy &VMap, - bool ModuleLevelChanges, +Function *llvm::CloneFunction(Function *F, ValueToValueMapTy &VMap, ClonedCodeInfo *CodeInfo) { std::vector<Type*> ArgTypes; @@ -237,7 +195,8 @@ Function *llvm::CloneFunction(const Function *F, ValueToValueMapTy &VMap, ArgTypes, F->getFunctionType()->isVarArg()); // Create the new function... - Function *NewF = Function::Create(FTy, F->getLinkage(), F->getName()); + Function *NewF = + Function::Create(FTy, F->getLinkage(), F->getName(), F->getParent()); // Loop over the arguments, copying the names of the mapped arguments over... Function::arg_iterator DestI = NewF->arg_begin(); @@ -247,11 +206,10 @@ Function *llvm::CloneFunction(const Function *F, ValueToValueMapTy &VMap, VMap[&I] = &*DestI++; // Add mapping to VMap } - if (ModuleLevelChanges) - CloneDebugInfoMetadata(NewF, F, VMap); - SmallVector<ReturnInst*, 8> Returns; // Ignore returns cloned. 
- CloneFunctionInto(NewF, F, VMap, ModuleLevelChanges, Returns, "", CodeInfo); + CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns, "", + CodeInfo); + return NewF; } @@ -338,9 +296,11 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, if (Value *MappedV = VMap.lookup(V)) V = MappedV; - VMap[&*II] = V; - delete NewInst; - continue; + if (!NewInst->mayHaveSideEffects()) { + VMap[&*II] = V; + delete NewInst; + continue; + } } } @@ -372,7 +332,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition()); // Or is a known constant in the caller... if (!Cond) { - Value *V = VMap[BI->getCondition()]; + Value *V = VMap.lookup(BI->getCondition()); Cond = dyn_cast_or_null<ConstantInt>(V); } @@ -388,7 +348,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, // If switching on a value known constant in the caller. ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition()); if (!Cond) { // Or known constant after constant prop in the callee... - Value *V = VMap[SI->getCondition()]; + Value *V = VMap.lookup(SI->getCondition()); Cond = dyn_cast_or_null<ConstantInt>(V); } if (Cond) { // Constant fold to uncond branch! @@ -475,7 +435,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // Defer PHI resolution until rest of function is resolved. SmallVector<const PHINode*, 16> PHIToResolve; for (const BasicBlock &BI : *OldFunc) { - Value *V = VMap[&BI]; + Value *V = VMap.lookup(&BI); BasicBlock *NewBB = cast_or_null<BasicBlock>(V); if (!NewBB) continue; // Dead block. @@ -519,7 +479,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, OPN = PHIToResolve[phino]; PHINode *PN = cast<PHINode>(VMap[OPN]); for (unsigned pred = 0, e = NumPreds; pred != e; ++pred) { - Value *V = VMap[PN->getIncomingBlock(pred)]; + Value *V = VMap.lookup(PN->getIncomingBlock(pred)); if (BasicBlock *MappedBlock = cast_or_null<BasicBlock>(V)) { Value *InVal = MapValue(PN->getIncomingValue(pred), VMap, @@ -529,7 +489,8 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, PN->setIncomingBlock(pred, MappedBlock); } else { PN->removeIncomingValue(pred, false); - --pred, --e; // Revisit the next entry. + --pred; // Revisit the next entry. + --e; } } } @@ -558,10 +519,9 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // entries. BasicBlock::iterator I = NewBB->begin(); for (; (PN = dyn_cast<PHINode>(I)); ++I) { - for (std::map<BasicBlock*, unsigned>::iterator PCI =PredCount.begin(), - E = PredCount.end(); PCI != E; ++PCI) { - BasicBlock *Pred = PCI->first; - for (unsigned NumToRemove = PCI->second; NumToRemove; --NumToRemove) + for (const auto &PCI : PredCount) { + BasicBlock *Pred = PCI.first; + for (unsigned NumToRemove = PCI.second; NumToRemove; --NumToRemove) PN->removeIncomingValue(Pred, false); } } @@ -684,7 +644,7 @@ void llvm::remapInstructionsInBlocks( for (auto *BB : Blocks) for (auto &Inst : *BB) RemapInstruction(&Inst, VMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); } /// \brief Clones a loop \p OrigLoop. 
Returns the loop and the blocks in \p @@ -697,6 +657,8 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB, const Twine &NameSuffix, LoopInfo *LI, DominatorTree *DT, SmallVectorImpl<BasicBlock *> &Blocks) { + assert(OrigLoop->getSubLoops().empty() && + "Loop to be cloned cannot have inner loop"); Function *F = OrigLoop->getHeader()->getParent(); Loop *ParentLoop = OrigLoop->getParentLoop(); @@ -727,13 +689,19 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB, // Update LoopInfo. NewLoop->addBasicBlockToLoop(NewBB, *LI); - // Update DominatorTree. - BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock(); - DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB])); + // Add DominatorTree node. After seeing all blocks, update to correct IDom. + DT->addNewBlock(NewBB, NewPH); Blocks.push_back(NewBB); } + for (BasicBlock *BB : OrigLoop->getBlocks()) { + // Update DominatorTree. + BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock(); + DT->changeImmediateDominator(cast<BasicBlock>(VMap[BB]), + cast<BasicBlock>(VMap[IDomBB])); + } + // Move them physically from the end of the block list. F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(), NewPH); diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp index ab083353ece6b..17e34c4ffa0f5 100644 --- a/lib/Transforms/Utils/CloneModule.cpp +++ b/lib/Transforms/Utils/CloneModule.cpp @@ -38,7 +38,7 @@ std::unique_ptr<Module> llvm::CloneModule(const Module *M, std::unique_ptr<Module> llvm::CloneModule( const Module *M, ValueToValueMapTy &VMap, - std::function<bool(const GlobalValue *)> ShouldCloneDefinition) { + function_ref<bool(const GlobalValue *)> ShouldCloneDefinition) { // First off, we need to create the new module. std::unique_ptr<Module> New = llvm::make_unique<Module>(M->getModuleIdentifier(), M->getContext()); @@ -53,7 +53,7 @@ std::unique_ptr<Module> llvm::CloneModule( for (Module::const_global_iterator I = M->global_begin(), E = M->global_end(); I != E; ++I) { GlobalVariable *GV = new GlobalVariable(*New, - I->getType()->getElementType(), + I->getValueType(), I->isConstant(), I->getLinkage(), (Constant*) nullptr, I->getName(), (GlobalVariable*) nullptr, @@ -64,12 +64,11 @@ std::unique_ptr<Module> llvm::CloneModule( } // Loop over the functions in the module, making external functions as before - for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) { - Function *NF = - Function::Create(cast<FunctionType>(I->getType()->getElementType()), - I->getLinkage(), I->getName(), New.get()); - NF->copyAttributesFrom(&*I); - VMap[&*I] = NF; + for (const Function &I : *M) { + Function *NF = Function::Create(cast<FunctionType>(I.getValueType()), + I.getLinkage(), I.getName(), New.get()); + NF->copyAttributesFrom(&I); + VMap[&I] = NF; } // Loop over the aliases in the module @@ -109,6 +108,9 @@ std::unique_ptr<Module> llvm::CloneModule( // for (Module::const_global_iterator I = M->global_begin(), E = M->global_end(); I != E; ++I) { + if (I->isDeclaration()) + continue; + GlobalVariable *GV = cast<GlobalVariable>(VMap[&*I]); if (!ShouldCloneDefinition(&*I)) { // Skip after setting the correct linkage for an external reference. @@ -121,27 +123,31 @@ std::unique_ptr<Module> llvm::CloneModule( // Similarly, copy over function bodies now... 
// - for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) { - Function *F = cast<Function>(VMap[&*I]); - if (!ShouldCloneDefinition(&*I)) { + for (const Function &I : *M) { + if (I.isDeclaration()) + continue; + + Function *F = cast<Function>(VMap[&I]); + if (!ShouldCloneDefinition(&I)) { // Skip after setting the correct linkage for an external reference. F->setLinkage(GlobalValue::ExternalLinkage); + // Personality function is not valid on a declaration. + F->setPersonalityFn(nullptr); continue; } - if (!I->isDeclaration()) { - Function::arg_iterator DestI = F->arg_begin(); - for (Function::const_arg_iterator J = I->arg_begin(); J != I->arg_end(); - ++J) { - DestI->setName(J->getName()); - VMap[&*J] = &*DestI++; - } - - SmallVector<ReturnInst*, 8> Returns; // Ignore returns cloned. - CloneFunctionInto(F, &*I, VMap, /*ModuleLevelChanges=*/true, Returns); + + Function::arg_iterator DestI = F->arg_begin(); + for (Function::const_arg_iterator J = I.arg_begin(); J != I.arg_end(); + ++J) { + DestI->setName(J->getName()); + VMap[&*J] = &*DestI++; } - if (I->hasPersonalityFn()) - F->setPersonalityFn(MapValue(I->getPersonalityFn(), VMap)); + SmallVector<ReturnInst *, 8> Returns; // Ignore returns cloned. + CloneFunctionInto(F, &I, VMap, /*ModuleLevelChanges=*/true, Returns); + + if (I.hasPersonalityFn()) + F->setPersonalityFn(MapValue(I.getPersonalityFn(), VMap)); } // And aliases diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp index 823696d88e652..9f2181f87cee1 100644 --- a/lib/Transforms/Utils/CodeExtractor.cpp +++ b/lib/Transforms/Utils/CodeExtractor.cpp @@ -77,15 +77,15 @@ static SetVector<BasicBlock *> buildExtractionBlockSet(IteratorT BBBegin, // Loop over the blocks, adding them to our set-vector, and aborting with an // empty set if we encounter invalid blocks. - for (IteratorT I = BBBegin, E = BBEnd; I != E; ++I) { - if (!Result.insert(*I)) + do { + if (!Result.insert(*BBBegin)) llvm_unreachable("Repeated basic blocks in extraction input"); - if (!isBlockValidForExtraction(**I)) { + if (!isBlockValidForExtraction(**BBBegin)) { Result.clear(); return Result; } - } + } while (++BBBegin != BBEnd); #ifndef NDEBUG for (SetVector<BasicBlock *>::iterator I = std::next(Result.begin()), @@ -159,23 +159,18 @@ static bool definedInCaller(const SetVector<BasicBlock *> &Blocks, Value *V) { void CodeExtractor::findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs) const { - for (SetVector<BasicBlock *>::const_iterator I = Blocks.begin(), - E = Blocks.end(); - I != E; ++I) { - BasicBlock *BB = *I; - + for (BasicBlock *BB : Blocks) { // If a used value is defined outside the region, it's an input. If an // instruction is used outside the region, it's an output. 
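Concretely: in a region containing %x = add i32 %a, 1 and %y = mul i32 %x, 2, where %a is defined before the region and %y is read after it, %a becomes an input and %y an output. A standalone toy version of the classification the loop below performs (ToyInst and the string-based sets are illustrative, not LLVM types):

#include <set>
#include <string>
#include <vector>

struct ToyInst {
  std::string Def;              // value this instruction defines
  std::vector<std::string> Ops; // values it reads
};

// Operands defined outside the region are inputs; values defined inside the
// region but read outside it are outputs.
static void findInputsOutputsToy(const std::vector<ToyInst> &Region,
                                 const std::set<std::string> &UsedOutside,
                                 std::set<std::string> &Inputs,
                                 std::set<std::string> &Outputs) {
  std::set<std::string> DefinedHere;
  for (const ToyInst &I : Region)
    DefinedHere.insert(I.Def);
  for (const ToyInst &I : Region) {
    for (const std::string &Op : I.Ops)
      if (!DefinedHere.count(Op))
        Inputs.insert(Op);
    if (UsedOutside.count(I.Def))
      Outputs.insert(I.Def);
  }
}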
- for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); - II != IE; ++II) { - for (User::op_iterator OI = II->op_begin(), OE = II->op_end(); - OI != OE; ++OI) + for (Instruction &II : *BB) { + for (User::op_iterator OI = II.op_begin(), OE = II.op_end(); OI != OE; + ++OI) if (definedInCaller(Blocks, *OI)) Inputs.insert(*OI); - for (User *U : II->users()) + for (User *U : II.users()) if (!definedInRegion(Blocks, U)) { - Outputs.insert(&*II); + Outputs.insert(&II); break; } } @@ -263,25 +258,21 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { } void CodeExtractor::splitReturnBlocks() { - for (SetVector<BasicBlock *>::iterator I = Blocks.begin(), E = Blocks.end(); - I != E; ++I) - if (ReturnInst *RI = dyn_cast<ReturnInst>((*I)->getTerminator())) { + for (BasicBlock *Block : Blocks) + if (ReturnInst *RI = dyn_cast<ReturnInst>(Block->getTerminator())) { BasicBlock *New = - (*I)->splitBasicBlock(RI->getIterator(), (*I)->getName() + ".ret"); + Block->splitBasicBlock(RI->getIterator(), Block->getName() + ".ret"); if (DT) { // Old dominates New. New node dominates all other nodes dominated // by Old. - DomTreeNode *OldNode = DT->getNode(*I); - SmallVector<DomTreeNode*, 8> Children; - for (DomTreeNode::iterator DI = OldNode->begin(), DE = OldNode->end(); - DI != DE; ++DI) - Children.push_back(*DI); + DomTreeNode *OldNode = DT->getNode(Block); + SmallVector<DomTreeNode *, 8> Children(OldNode->begin(), + OldNode->end()); - DomTreeNode *NewNode = DT->addNewBlock(New, *I); + DomTreeNode *NewNode = DT->addNewBlock(New, Block); - for (SmallVectorImpl<DomTreeNode *>::iterator I = Children.begin(), - E = Children.end(); I != E; ++I) - DT->changeImmediateDominator(*I, NewNode); + for (DomTreeNode *I : Children) + DT->changeImmediateDominator(I, NewNode); } } } @@ -310,28 +301,26 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, std::vector<Type*> paramTy; // Add the types of the input values to the function's argument list - for (ValueSet::const_iterator i = inputs.begin(), e = inputs.end(); - i != e; ++i) { - const Value *value = *i; + for (Value *value : inputs) { DEBUG(dbgs() << "value used in func: " << *value << "\n"); paramTy.push_back(value->getType()); } // Add the types of the output values to the function's argument list. 
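For example, with AggregateArgs disabled, one i32 input and one i32 output yield the extracted signature void f(i32, i32*): inputs are passed by value and each output is returned through a pointer parameter, exactly as the loop below arranges. A sketch of building that type directly with the IR API (the function name and the in-scope LLVMContext are assumptions for illustration):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

static llvm::FunctionType *exampleExtractedType(llvm::LLVMContext &Ctx) {
  std::vector<llvm::Type *> ParamTy;
  ParamTy.push_back(llvm::Type::getInt32Ty(Ctx));       // i32 input, by value
  ParamTy.push_back(llvm::PointerType::getUnqual(       // i32 output, via ptr
      llvm::Type::getInt32Ty(Ctx)));
  return llvm::FunctionType::get(llvm::Type::getVoidTy(Ctx), ParamTy,
                                 /*isVarArg=*/false);
}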
- for (ValueSet::const_iterator I = outputs.begin(), E = outputs.end(); - I != E; ++I) { - DEBUG(dbgs() << "instr used in func: " << **I << "\n"); + for (Value *output : outputs) { + DEBUG(dbgs() << "instr used in func: " << *output << "\n"); if (AggregateArgs) - paramTy.push_back((*I)->getType()); + paramTy.push_back(output->getType()); else - paramTy.push_back(PointerType::getUnqual((*I)->getType())); + paramTy.push_back(PointerType::getUnqual(output->getType())); } - DEBUG(dbgs() << "Function type: " << *RetTy << " f("); - for (std::vector<Type*>::iterator i = paramTy.begin(), - e = paramTy.end(); i != e; ++i) - DEBUG(dbgs() << **i << ", "); - DEBUG(dbgs() << ")\n"); + DEBUG({ + dbgs() << "Function type: " << *RetTy << " f("; + for (Type *i : paramTy) + dbgs() << *i << ", "; + dbgs() << ")\n"; + }); StructType *StructTy; if (AggregateArgs && (inputs.size() + outputs.size() > 0)) { @@ -372,9 +361,8 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, RewriteVal = &*AI++; std::vector<User*> Users(inputs[i]->user_begin(), inputs[i]->user_end()); - for (std::vector<User*>::iterator use = Users.begin(), useE = Users.end(); - use != useE; ++use) - if (Instruction* inst = dyn_cast<Instruction>(*use)) + for (User *use : Users) + if (Instruction *inst = dyn_cast<Instruction>(use)) if (Blocks.count(inst->getParent())) inst->replaceUsesOfWith(inputs[i], RewriteVal); } @@ -429,19 +417,19 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, LLVMContext &Context = newFunction->getContext(); // Add inputs as params, or to be filled into the struct - for (ValueSet::iterator i = inputs.begin(), e = inputs.end(); i != e; ++i) + for (Value *input : inputs) if (AggregateArgs) - StructValues.push_back(*i); + StructValues.push_back(input); else - params.push_back(*i); + params.push_back(input); // Create allocas for the outputs - for (ValueSet::iterator i = outputs.begin(), e = outputs.end(); i != e; ++i) { + for (Value *output : outputs) { if (AggregateArgs) { - StructValues.push_back(*i); + StructValues.push_back(output); } else { AllocaInst *alloca = - new AllocaInst((*i)->getType(), nullptr, (*i)->getName() + ".loc", + new AllocaInst(output->getType(), nullptr, output->getName() + ".loc", &codeReplacer->getParent()->front().front()); ReloadOutputs.push_back(alloca); params.push_back(alloca); @@ -522,9 +510,8 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, std::map<BasicBlock*, BasicBlock*> ExitBlockMap; unsigned switchVal = 0; - for (SetVector<BasicBlock*>::const_iterator i = Blocks.begin(), - e = Blocks.end(); i != e; ++i) { - TerminatorInst *TI = (*i)->getTerminator(); + for (BasicBlock *Block : Blocks) { + TerminatorInst *TI = Block->getTerminator(); for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) if (!Blocks.count(TI->getSuccessor(i))) { BasicBlock *OldTarget = TI->getSuccessor(i); @@ -576,10 +563,9 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, // Make sure we are looking at the original successor block, not // at a newly inserted exit block, which won't be in the dominator // info. 
- for (std::map<BasicBlock*, BasicBlock*>::iterator I = - ExitBlockMap.begin(), E = ExitBlockMap.end(); I != E; ++I) - if (DefBlock == I->second) { - DefBlock = I->first; + for (const auto &I : ExitBlockMap) + if (DefBlock == I.second) { + DefBlock = I.first; break; } @@ -677,13 +663,12 @@ void CodeExtractor::moveCodeToFunction(Function *newFunction) { Function::BasicBlockListType &oldBlocks = oldFunc->getBasicBlockList(); Function::BasicBlockListType &newBlocks = newFunction->getBasicBlockList(); - for (SetVector<BasicBlock*>::const_iterator i = Blocks.begin(), - e = Blocks.end(); i != e; ++i) { + for (BasicBlock *Block : Blocks) { // Delete the basic block from the old function, and the list of blocks - oldBlocks.remove(*i); + oldBlocks.remove(Block); // Insert this basic block into the new function - newBlocks.push_back(*i); + newBlocks.push_back(Block); } } @@ -721,9 +706,9 @@ Function *CodeExtractor::extractCodeRegion() { findInputsOutputs(inputs, outputs); SmallPtrSet<BasicBlock *, 1> ExitBlocks; - for (SetVector<BasicBlock *>::iterator I = Blocks.begin(), E = Blocks.end(); - I != E; ++I) - for (succ_iterator SI = succ_begin(*I), SE = succ_end(*I); SI != SE; ++SI) + for (BasicBlock *Block : Blocks) + for (succ_iterator SI = succ_begin(Block), SE = succ_end(Block); SI != SE; + ++SI) if (!Blocks.count(*SI)) ExitBlocks.insert(*SI); NumExitBlocks = ExitBlocks.size(); diff --git a/lib/Transforms/Utils/Evaluator.cpp b/lib/Transforms/Utils/Evaluator.cpp new file mode 100644 index 0000000000000..cd130abf45192 --- /dev/null +++ b/lib/Transforms/Utils/Evaluator.cpp @@ -0,0 +1,596 @@ +//===- Evaluator.cpp - LLVM IR evaluator ----------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Function evaluator for LLVM IR. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/Evaluator.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Operator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "evaluator" + +using namespace llvm; + +static inline bool +isSimpleEnoughValueToCommit(Constant *C, + SmallPtrSetImpl<Constant *> &SimpleConstants, + const DataLayout &DL); + +/// Return true if the specified constant can be handled by the code generator. +/// We don't want to generate something like: +/// void *X = &X/42; +/// because the code generator doesn't have a relocation that can handle that. +/// +/// This function should be called if C was not found (but just got inserted) +/// in SimpleConstants to avoid having to rescan the same constants all the +/// time. +static bool +isSimpleEnoughValueToCommitHelper(Constant *C, + SmallPtrSetImpl<Constant *> &SimpleConstants, + const DataLayout &DL) { + // Simple global addresses are supported, do not allow dllimport or + // thread-local globals. + if (auto *GV = dyn_cast<GlobalValue>(C)) + return !GV->hasDLLImportStorageClass() && !GV->isThreadLocal(); + + // Simple integer, undef, constant aggregate zero, etc are all supported. 
+ if (C->getNumOperands() == 0 || isa<BlockAddress>(C)) + return true; + + // Aggregate values are safe if all their elements are. + if (isa<ConstantAggregate>(C)) { + for (Value *Op : C->operands()) + if (!isSimpleEnoughValueToCommit(cast<Constant>(Op), SimpleConstants, DL)) + return false; + return true; + } + + // We don't know exactly what relocations are allowed in constant expressions, + // so we allow &global+constantoffset, which is safe and uniformly supported + // across targets. + ConstantExpr *CE = cast<ConstantExpr>(C); + switch (CE->getOpcode()) { + case Instruction::BitCast: + // Bitcast is fine if the casted value is fine. + return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL); + + case Instruction::IntToPtr: + case Instruction::PtrToInt: + // int <=> ptr is fine if the int type is the same size as the + // pointer type. + if (DL.getTypeSizeInBits(CE->getType()) != + DL.getTypeSizeInBits(CE->getOperand(0)->getType())) + return false; + return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL); + + // GEP is fine if it is simple + constant offset. + case Instruction::GetElementPtr: + for (unsigned i = 1, e = CE->getNumOperands(); i != e; ++i) + if (!isa<ConstantInt>(CE->getOperand(i))) + return false; + return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL); + + case Instruction::Add: + // We allow simple+cst. + if (!isa<ConstantInt>(CE->getOperand(1))) + return false; + return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL); + } + return false; +} + +static inline bool +isSimpleEnoughValueToCommit(Constant *C, + SmallPtrSetImpl<Constant *> &SimpleConstants, + const DataLayout &DL) { + // If we already checked this constant, we win. + if (!SimpleConstants.insert(C).second) + return true; + // Check the constant. + return isSimpleEnoughValueToCommitHelper(C, SimpleConstants, DL); +} + +/// Return true if this constant is simple enough for us to understand. In +/// particular, if it is a cast to anything other than from one pointer type to +/// another pointer type, we punt. We basically just support direct accesses to +/// globals and GEP's of globals. This should be kept up to date with +/// CommitValueTo. +static bool isSimpleEnoughPointerToCommit(Constant *C) { + // Conservatively, avoid aggregate types. This is because we don't + // want to worry about them partially overlapping other stores. + if (!cast<PointerType>(C->getType())->getElementType()->isSingleValueType()) + return false; + + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) + // Do not allow weak/*_odr/linkonce linkage or external globals. + return GV->hasUniqueInitializer(); + + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) { + // Handle a constantexpr gep. + if (CE->getOpcode() == Instruction::GetElementPtr && + isa<GlobalVariable>(CE->getOperand(0)) && + cast<GEPOperator>(CE)->isInBounds()) { + GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0)); + // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or + // external globals. + if (!GV->hasUniqueInitializer()) + return false; + + // The first index must be zero. + ConstantInt *CI = dyn_cast<ConstantInt>(*std::next(CE->op_begin())); + if (!CI || !CI->isZero()) return false; + + // The remaining indices must be compile-time known integers within the + // notional bounds of the corresponding static array types. 
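Taken together, these rules admit a global address plus a constant offset but reject, say, an address divided by a constant (the &X/42 case from the file comment), and the isGEPWithNoNotionalOverIndexing check that follows enforces the in-bounds requirement just described. A toy mirror of the value recursion over a simplified constant-expression tree (ToyConst and simpleEnoughToy are illustrative, not LLVM types):

struct ToyConst {
  enum Kind { GlobalAddr, Int, Add, Div, Bitcast } K;
  const ToyConst *LHS = nullptr;
  const ToyConst *RHS = nullptr;
};

// Leaves commit trivially, casts recurse, Add is allowed only as
// simple + constant-int; anything else (e.g. Div) has no relocation
// that can encode it.
static bool simpleEnoughToy(const ToyConst *C) {
  switch (C->K) {
  case ToyConst::GlobalAddr:
  case ToyConst::Int:
    return true;
  case ToyConst::Bitcast:
    return simpleEnoughToy(C->LHS);
  case ToyConst::Add:
    return C->RHS->K == ToyConst::Int && simpleEnoughToy(C->LHS);
  case ToyConst::Div: // e.g. &X/42
    return false;
  }
  return false;
}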
+ if (!CE->isGEPWithNoNotionalOverIndexing()) + return false; + + return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE); + + // A constantexpr bitcast from a pointer to another pointer is a no-op, + // and we know how to evaluate it by moving the bitcast from the pointer + // operand to the value operand. + } else if (CE->getOpcode() == Instruction::BitCast && + isa<GlobalVariable>(CE->getOperand(0))) { + // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or + // external globals. + return cast<GlobalVariable>(CE->getOperand(0))->hasUniqueInitializer(); + } + } + + return false; +} + +/// Return the value that would be computed by a load from P after the stores +/// reflected by 'memory' have been performed. If we can't decide, return null. +Constant *Evaluator::ComputeLoadResult(Constant *P) { + // If this memory location has been recently stored, use the stored value: it + // is the most up-to-date. + DenseMap<Constant*, Constant*>::const_iterator I = MutatedMemory.find(P); + if (I != MutatedMemory.end()) return I->second; + + // Access it. + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) { + if (GV->hasDefinitiveInitializer()) + return GV->getInitializer(); + return nullptr; + } + + // Handle a constantexpr getelementptr. + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(P)) + if (CE->getOpcode() == Instruction::GetElementPtr && + isa<GlobalVariable>(CE->getOperand(0))) { + GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0)); + if (GV->hasDefinitiveInitializer()) + return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE); + } + + return nullptr; // don't know how to evaluate. +} + +/// Evaluate all instructions in block BB, returning true if successful, false +/// if we can't evaluate it. NewBB returns the next BB that control flows into, +/// or null upon return. +bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, + BasicBlock *&NextBB) { + // This is the main evaluation loop. + while (1) { + Constant *InstResult = nullptr; + + DEBUG(dbgs() << "Evaluating Instruction: " << *CurInst << "\n"); + + if (StoreInst *SI = dyn_cast<StoreInst>(CurInst)) { + if (!SI->isSimple()) { + DEBUG(dbgs() << "Store is not simple! Can not evaluate.\n"); + return false; // no volatile/atomic accesses. + } + Constant *Ptr = getVal(SI->getOperand(1)); + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) { + DEBUG(dbgs() << "Folding constant ptr expression: " << *Ptr); + Ptr = ConstantFoldConstantExpression(CE, DL, TLI); + DEBUG(dbgs() << "; To: " << *Ptr << "\n"); + } + if (!isSimpleEnoughPointerToCommit(Ptr)) { + // If this is too complex for us to commit, reject it. + DEBUG(dbgs() << "Pointer is too complex for us to evaluate store."); + return false; + } + + Constant *Val = getVal(SI->getOperand(0)); + + // If this might be too difficult for the backend to handle (e.g. the addr + // of one global variable divided by another) then we can't commit it. + if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, DL)) { + DEBUG(dbgs() << "Store value is too complex to evaluate store. " << *Val + << "\n"); + return false; + } + + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) { + if (CE->getOpcode() == Instruction::BitCast) { + DEBUG(dbgs() << "Attempting to resolve bitcast on constant ptr.\n"); + // If we're evaluating a store through a bitcast, then we need + // to pull the bitcast off the pointer type and push it onto the + // stored value. 
+ Ptr = CE->getOperand(0); + + Type *NewTy = cast<PointerType>(Ptr->getType())->getElementType(); + + // In order to push the bitcast onto the stored value, a bitcast + // from NewTy to Val's type must be legal. If it's not, we can try + // introspecting NewTy to find a legal conversion. + while (!Val->getType()->canLosslesslyBitCastTo(NewTy)) { + // If NewTy is a struct, we can convert the pointer to the struct + // into a pointer to its first member. + // FIXME: This could be extended to support arrays as well. + if (StructType *STy = dyn_cast<StructType>(NewTy)) { + NewTy = STy->getTypeAtIndex(0U); + + IntegerType *IdxTy = IntegerType::get(NewTy->getContext(), 32); + Constant *IdxZero = ConstantInt::get(IdxTy, 0, false); + Constant * const IdxList[] = {IdxZero, IdxZero}; + + Ptr = ConstantExpr::getGetElementPtr(nullptr, Ptr, IdxList); + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) + Ptr = ConstantFoldConstantExpression(CE, DL, TLI); + + // If we can't improve the situation by introspecting NewTy, + // we have to give up. + } else { + DEBUG(dbgs() << "Failed to bitcast constant ptr, can not " + "evaluate.\n"); + return false; + } + } + + // If we found compatible types, go ahead and push the bitcast + // onto the stored value. + Val = ConstantExpr::getBitCast(Val, NewTy); + + DEBUG(dbgs() << "Evaluated bitcast: " << *Val << "\n"); + } + } + + MutatedMemory[Ptr] = Val; + } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(CurInst)) { + InstResult = ConstantExpr::get(BO->getOpcode(), + getVal(BO->getOperand(0)), + getVal(BO->getOperand(1))); + DEBUG(dbgs() << "Found a BinaryOperator! Simplifying: " << *InstResult + << "\n"); + } else if (CmpInst *CI = dyn_cast<CmpInst>(CurInst)) { + InstResult = ConstantExpr::getCompare(CI->getPredicate(), + getVal(CI->getOperand(0)), + getVal(CI->getOperand(1))); + DEBUG(dbgs() << "Found a CmpInst! Simplifying: " << *InstResult + << "\n"); + } else if (CastInst *CI = dyn_cast<CastInst>(CurInst)) { + InstResult = ConstantExpr::getCast(CI->getOpcode(), + getVal(CI->getOperand(0)), + CI->getType()); + DEBUG(dbgs() << "Found a Cast! Simplifying: " << *InstResult + << "\n"); + } else if (SelectInst *SI = dyn_cast<SelectInst>(CurInst)) { + InstResult = ConstantExpr::getSelect(getVal(SI->getOperand(0)), + getVal(SI->getOperand(1)), + getVal(SI->getOperand(2))); + DEBUG(dbgs() << "Found a Select! Simplifying: " << *InstResult + << "\n"); + } else if (auto *EVI = dyn_cast<ExtractValueInst>(CurInst)) { + InstResult = ConstantExpr::getExtractValue( + getVal(EVI->getAggregateOperand()), EVI->getIndices()); + DEBUG(dbgs() << "Found an ExtractValueInst! Simplifying: " << *InstResult + << "\n"); + } else if (auto *IVI = dyn_cast<InsertValueInst>(CurInst)) { + InstResult = ConstantExpr::getInsertValue( + getVal(IVI->getAggregateOperand()), + getVal(IVI->getInsertedValueOperand()), IVI->getIndices()); + DEBUG(dbgs() << "Found an InsertValueInst! Simplifying: " << *InstResult + << "\n"); + } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurInst)) { + Constant *P = getVal(GEP->getOperand(0)); + SmallVector<Constant*, 8> GEPOps; + for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end(); + i != e; ++i) + GEPOps.push_back(getVal(*i)); + InstResult = + ConstantExpr::getGetElementPtr(GEP->getSourceElementType(), P, GEPOps, + cast<GEPOperator>(GEP)->isInBounds()); + DEBUG(dbgs() << "Found a GEP! 
Simplifying: " << *InstResult + << "\n"); + } else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) { + + if (!LI->isSimple()) { + DEBUG(dbgs() << "Found a Load! Not a simple load, can not evaluate.\n"); + return false; // no volatile/atomic accesses. + } + + Constant *Ptr = getVal(LI->getOperand(0)); + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) { + Ptr = ConstantFoldConstantExpression(CE, DL, TLI); + DEBUG(dbgs() << "Found a constant pointer expression, constant " + "folding: " << *Ptr << "\n"); + } + InstResult = ComputeLoadResult(Ptr); + if (!InstResult) { + DEBUG(dbgs() << "Failed to compute load result. Can not evaluate load." + "\n"); + return false; // Could not evaluate load. + } + + DEBUG(dbgs() << "Evaluated load: " << *InstResult << "\n"); + } else if (AllocaInst *AI = dyn_cast<AllocaInst>(CurInst)) { + if (AI->isArrayAllocation()) { + DEBUG(dbgs() << "Found an array alloca. Can not evaluate.\n"); + return false; // Cannot handle array allocs. + } + Type *Ty = AI->getAllocatedType(); + AllocaTmps.push_back( + make_unique<GlobalVariable>(Ty, false, GlobalValue::InternalLinkage, + UndefValue::get(Ty), AI->getName())); + InstResult = AllocaTmps.back().get(); + DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n"); + } else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) { + CallSite CS(&*CurInst); + + // Debug info can safely be ignored here. + if (isa<DbgInfoIntrinsic>(CS.getInstruction())) { + DEBUG(dbgs() << "Ignoring debug info.\n"); + ++CurInst; + continue; + } + + // Cannot handle inline asm. + if (isa<InlineAsm>(CS.getCalledValue())) { + DEBUG(dbgs() << "Found inline asm, can not evaluate.\n"); + return false; + } + + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) { + if (MemSetInst *MSI = dyn_cast<MemSetInst>(II)) { + if (MSI->isVolatile()) { + DEBUG(dbgs() << "Can not optimize a volatile memset " << + "intrinsic.\n"); + return false; + } + Constant *Ptr = getVal(MSI->getDest()); + Constant *Val = getVal(MSI->getValue()); + Constant *DestVal = ComputeLoadResult(getVal(Ptr)); + if (Val->isNullValue() && DestVal && DestVal->isNullValue()) { + // This memset is a no-op. + DEBUG(dbgs() << "Ignoring no-op memset.\n"); + ++CurInst; + continue; + } + } + + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) { + DEBUG(dbgs() << "Ignoring lifetime intrinsic.\n"); + ++CurInst; + continue; + } + + if (II->getIntrinsicID() == Intrinsic::invariant_start) { + // We don't insert an entry into Values, as it doesn't have a + // meaningful return value. + if (!II->use_empty()) { + DEBUG(dbgs() << "Found unused invariant_start. Can't evaluate.\n"); + return false; + } + ConstantInt *Size = cast<ConstantInt>(II->getArgOperand(0)); + Value *PtrArg = getVal(II->getArgOperand(1)); + Value *Ptr = PtrArg->stripPointerCasts(); + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) { + Type *ElemTy = GV->getValueType(); + if (!Size->isAllOnesValue() && + Size->getValue().getLimitedValue() >= + DL.getTypeStoreSize(ElemTy)) { + Invariants.insert(GV); + DEBUG(dbgs() << "Found a global var that is an invariant: " << *GV + << "\n"); + } else { + DEBUG(dbgs() << "Found a global var, but can not treat it as an " + "invariant.\n"); + } + } + // Continue even if we do nothing. + ++CurInst; + continue; + } else if (II->getIntrinsicID() == Intrinsic::assume) { + DEBUG(dbgs() << "Skipping assume intrinsic.\n"); + ++CurInst; + continue; + } + + DEBUG(dbgs() << "Unknown intrinsic. 
Can not evaluate.\n"); + return false; + } + + // Resolve function pointers. + Function *Callee = dyn_cast<Function>(getVal(CS.getCalledValue())); + if (!Callee || Callee->isInterposable()) { + DEBUG(dbgs() << "Can not resolve function pointer.\n"); + return false; // Cannot resolve. + } + + SmallVector<Constant*, 8> Formals; + for (User::op_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i) + Formals.push_back(getVal(*i)); + + if (Callee->isDeclaration()) { + // If this is a function we can constant fold, do it. + if (Constant *C = ConstantFoldCall(Callee, Formals, TLI)) { + InstResult = C; + DEBUG(dbgs() << "Constant folded function call. Result: " << + *InstResult << "\n"); + } else { + DEBUG(dbgs() << "Can not constant fold function call.\n"); + return false; + } + } else { + if (Callee->getFunctionType()->isVarArg()) { + DEBUG(dbgs() << "Can not constant fold vararg function call.\n"); + return false; + } + + Constant *RetVal = nullptr; + // Execute the call, if successful, use the return value. + ValueStack.emplace_back(); + if (!EvaluateFunction(Callee, RetVal, Formals)) { + DEBUG(dbgs() << "Failed to evaluate function.\n"); + return false; + } + ValueStack.pop_back(); + InstResult = RetVal; + + if (InstResult) { + DEBUG(dbgs() << "Successfully evaluated function. Result: " + << *InstResult << "\n\n"); + } else { + DEBUG(dbgs() << "Successfully evaluated function. Result: 0\n\n"); + } + } + } else if (isa<TerminatorInst>(CurInst)) { + DEBUG(dbgs() << "Found a terminator instruction.\n"); + + if (BranchInst *BI = dyn_cast<BranchInst>(CurInst)) { + if (BI->isUnconditional()) { + NextBB = BI->getSuccessor(0); + } else { + ConstantInt *Cond = + dyn_cast<ConstantInt>(getVal(BI->getCondition())); + if (!Cond) return false; // Cannot determine. + + NextBB = BI->getSuccessor(!Cond->getZExtValue()); + } + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurInst)) { + ConstantInt *Val = + dyn_cast<ConstantInt>(getVal(SI->getCondition())); + if (!Val) return false; // Cannot determine. + NextBB = SI->findCaseValue(Val).getCaseSuccessor(); + } else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(CurInst)) { + Value *Val = getVal(IBI->getAddress())->stripPointerCasts(); + if (BlockAddress *BA = dyn_cast<BlockAddress>(Val)) + NextBB = BA->getBasicBlock(); + else + return false; // Cannot determine. + } else if (isa<ReturnInst>(CurInst)) { + NextBB = nullptr; + } else { + // invoke, unwind, resume, unreachable. + DEBUG(dbgs() << "Can not handle terminator."); + return false; // Cannot handle this terminator. + } + + // We succeeded at evaluating this block! + DEBUG(dbgs() << "Successfully evaluated block.\n"); + return true; + } else { + // Did not know how to evaluate this! + DEBUG(dbgs() << "Failed to evaluate block due to unhandled instruction." + "\n"); + return false; + } + + if (!CurInst->use_empty()) { + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(InstResult)) + InstResult = ConstantFoldConstantExpression(CE, DL, TLI); + + setVal(&*CurInst, InstResult); + } + + // If we just processed an invoke, we finished evaluating the block. + if (InvokeInst *II = dyn_cast<InvokeInst>(CurInst)) { + NextBB = II->getNormalDest(); + DEBUG(dbgs() << "Found an invoke instruction. Finished Block.\n\n"); + return true; + } + + // Advance program counter. + ++CurInst; + } +} + +/// Evaluate a call to function F, returning true if successful, false if we +/// can't evaluate it. ActualArgs contains the formal arguments for the +/// function. 
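In use, the evaluator is driven through EvaluateFunction, defined next. A hedged sketch of a caller in the style of this class's GlobalOpt client, assuming an Evaluator constructor taking (const DataLayout &, const TargetLibraryInfo *) and a parameterless function F:

static Constant *tryEvaluate(Function *F, const DataLayout &DL,
                             const TargetLibraryInfo *TLI) {
  Evaluator Eval(DL, TLI);
  Constant *RetVal = nullptr;
  SmallVector<Constant *, 1> NoArgs; // F takes no parameters in this sketch
  if (!Eval.EvaluateFunction(F, RetVal, NoArgs))
    return nullptr; // hit something the evaluator cannot model
  return RetVal;    // may be null if F returns void
}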
+bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal, + const SmallVectorImpl<Constant*> &ActualArgs) { + // Check to see if this function is already executing (recursion). If so, + // bail out. TODO: we might want to accept limited recursion. + if (std::find(CallStack.begin(), CallStack.end(), F) != CallStack.end()) + return false; + + CallStack.push_back(F); + + // Initialize arguments to the incoming values specified. + unsigned ArgNo = 0; + for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E; + ++AI, ++ArgNo) + setVal(&*AI, ActualArgs[ArgNo]); + + // ExecutedBlocks - We only handle non-looping, non-recursive code. As such, + // we can only evaluate any one basic block at most once. This set keeps + // track of what we have executed so we can detect recursive cases etc. + SmallPtrSet<BasicBlock*, 32> ExecutedBlocks; + + // CurBB - The current basic block we're evaluating. + BasicBlock *CurBB = &F->front(); + + BasicBlock::iterator CurInst = CurBB->begin(); + + while (1) { + BasicBlock *NextBB = nullptr; // Initialized to avoid compiler warnings. + DEBUG(dbgs() << "Trying to evaluate BB: " << *CurBB << "\n"); + + if (!EvaluateBlock(CurInst, NextBB)) + return false; + + if (!NextBB) { + // Successfully running until there's no next block means that we found + // the return. Fill in the return value and pop the call stack. + ReturnInst *RI = cast<ReturnInst>(CurBB->getTerminator()); + if (RI->getNumOperands()) + RetVal = getVal(RI->getOperand(0)); + CallStack.pop_back(); + return true; + } + + // Okay, we succeeded in evaluating this control flow. See if we have + // executed the new block before. If so, we have a looping function, + // which we cannot evaluate in reasonable time. + if (!ExecutedBlocks.insert(NextBB).second) + return false; // looped! + + // Okay, we have never been in this block before. Check to see if there + // are any PHI nodes. If so, evaluate them with information about where + // we came from. + PHINode *PN = nullptr; + for (CurInst = NextBB->begin(); + (PN = dyn_cast<PHINode>(CurInst)); ++CurInst) + setVal(PN, getVal(PN->getIncomingValueForBlock(CurBB))); + + // Advance to the next block. + CurBB = NextBB; + } +} + diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp new file mode 100644 index 0000000000000..fcb25baf32167 --- /dev/null +++ b/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -0,0 +1,243 @@ +//===- lib/Transforms/Utils/FunctionImportUtils.cpp - Importing utilities -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the FunctionImportGlobalProcessing class, used +// to perform the necessary global value handling for function importing. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/ModuleSummaryAnalysis.h" +#include "llvm/Transforms/Utils/FunctionImportUtils.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +using namespace llvm; + +/// Checks if we should import SGV as a definition, otherwise import as a +/// declaration. +bool FunctionImportGlobalProcessing::doImportAsDefinition( + const GlobalValue *SGV, DenseSet<const GlobalValue *> *GlobalsToImport) { + + // For alias, we tie the definition to the base object.
Extract it and recurse + if (auto *GA = dyn_cast<GlobalAlias>(SGV)) { + if (GA->hasWeakAnyLinkage()) + return false; + const GlobalObject *GO = GA->getBaseObject(); + if (!GO->hasLinkOnceODRLinkage()) + return false; + return FunctionImportGlobalProcessing::doImportAsDefinition( + GO, GlobalsToImport); + } + // Only import the globals requested for importing. + if (GlobalsToImport->count(SGV)) + return true; + // Otherwise no. + return false; +} + +bool FunctionImportGlobalProcessing::doImportAsDefinition( + const GlobalValue *SGV) { + if (!isPerformingImport()) + return false; + return FunctionImportGlobalProcessing::doImportAsDefinition(SGV, + GlobalsToImport); +} + +bool FunctionImportGlobalProcessing::doPromoteLocalToGlobal( + const GlobalValue *SGV) { + assert(SGV->hasLocalLinkage()); + // Both the imported references and the original local variable must + // be promoted. + if (!isPerformingImport() && !isModuleExporting()) + return false; + + // Local const variables never need to be promoted unless they are address + // taken. The imported uses can simply use the clone created in this module. + // For now we are conservative in determining which variables are not + // address taken by checking the unnamed addr flag. To be more aggressive, + // the address taken information must be checked earlier during parsing + // of the module and recorded in the summary index for use when importing + // from that module. + auto *GVar = dyn_cast<GlobalVariable>(SGV); + if (GVar && GVar->isConstant() && GVar->hasGlobalUnnamedAddr()) + return false; + + if (GVar && GVar->hasSection()) + // Some sections like "__DATA,__cfstring" are "magic" and promotion is not + // allowed. Just disable promotion on any GVar with sections right now. + return false; + + // Eventually we only need to promote functions in the exporting module that + // are referenced by a potentially exported function (i.e. one that is in the + // summary index). + return true; +} + +std::string FunctionImportGlobalProcessing::getName(const GlobalValue *SGV) { + // For locals that must be promoted to global scope, ensure that + // the promoted name uniquely identifies the copy in the original module, + // using the ID assigned during combined index creation. When importing, + // we rename all locals (not just those that are promoted) in order to + // avoid naming conflicts between locals imported from different modules. + if (SGV->hasLocalLinkage() && + (doPromoteLocalToGlobal(SGV) || isPerformingImport())) + return ModuleSummaryIndex::getGlobalNameForLocal( + SGV->getName(), + ImportIndex.getModuleHash(SGV->getParent()->getModuleIdentifier())); + return SGV->getName(); +} + +GlobalValue::LinkageTypes +FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV) { + // Any local variable that is referenced by an exported function needs + // to be promoted to global scope. Since we don't currently know which + // functions reference which local variables/functions, we must treat + // all as potentially exported if this module is exporting anything. + if (isModuleExporting()) { + if (SGV->hasLocalLinkage() && doPromoteLocalToGlobal(SGV)) + return GlobalValue::ExternalLinkage; + return SGV->getLinkage(); + } + + // Otherwise, if we aren't importing, no linkage change is needed. 
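As a concrete instance of the promotion path above: an internal-linkage global that must be promoted is given an index-derived, module-unique name via getGlobalNameForLocal, external linkage, and (in processGlobalForThinLTO below) hidden visibility so the promoted symbol does not escape the final link. A sketch of just that transformation, with NewName standing in for the index-derived name; the switch that follows then maps every source linkage for the import case:

static void promoteLocal(GlobalValue &GV, StringRef NewName) {
  assert(GV.hasLocalLinkage() && "only locals are promoted");
  GV.setName(NewName);
  GV.setLinkage(GlobalValue::ExternalLinkage);
  // Hide the promoted symbol so it cannot be referenced outside the DSO.
  GV.setVisibility(GlobalValue::HiddenVisibility);
}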
+ if (!isPerformingImport()) + return SGV->getLinkage(); + + switch (SGV->getLinkage()) { + case GlobalValue::ExternalLinkage: + // External definitions are converted to available_externally + // definitions upon import, so that they are available for inlining + // and/or optimization, but are turned into declarations later + // during the EliminateAvailableExternally pass. + if (doImportAsDefinition(SGV) && !dyn_cast<GlobalAlias>(SGV)) + return GlobalValue::AvailableExternallyLinkage; + // An imported external declaration stays external. + return SGV->getLinkage(); + + case GlobalValue::AvailableExternallyLinkage: + // An imported available_externally definition converts + // to external if imported as a declaration. + if (!doImportAsDefinition(SGV)) + return GlobalValue::ExternalLinkage; + // An imported available_externally declaration stays that way. + return SGV->getLinkage(); + + case GlobalValue::LinkOnceAnyLinkage: + case GlobalValue::LinkOnceODRLinkage: + // These both stay the same when importing the definition. + // The ThinLTO pass will eventually force-import their definitions. + return SGV->getLinkage(); + + case GlobalValue::WeakAnyLinkage: + // Can't import weak_any definitions correctly, or we might change the + // program semantics, since the linker will pick the first weak_any + // definition and importing would change the order they are seen by the + // linker. The module linking caller needs to enforce this. + assert(!doImportAsDefinition(SGV)); + // If imported as a declaration, it becomes external_weak. + return SGV->getLinkage(); + + case GlobalValue::WeakODRLinkage: + // For weak_odr linkage, there is a guarantee that all copies will be + // equivalent, so the issue described above for weak_any does not exist, + // and the definition can be imported. It can be treated similarly + // to an imported externally visible global value. + if (doImportAsDefinition(SGV) && !dyn_cast<GlobalAlias>(SGV)) + return GlobalValue::AvailableExternallyLinkage; + else + return GlobalValue::ExternalLinkage; + + case GlobalValue::AppendingLinkage: + // It would be incorrect to import an appending linkage variable, + // since it would cause global constructors/destructors to be + // executed multiple times. This should have already been handled + // by linkIfNeeded, and we will assert in shouldLinkFromSource + // if we try to import, so we simply return AppendingLinkage. + return GlobalValue::AppendingLinkage; + + case GlobalValue::InternalLinkage: + case GlobalValue::PrivateLinkage: + // If we are promoting the local to global scope, it is handled + // similarly to a normal externally visible global. + if (doPromoteLocalToGlobal(SGV)) { + if (doImportAsDefinition(SGV) && !dyn_cast<GlobalAlias>(SGV)) + return GlobalValue::AvailableExternallyLinkage; + else + return GlobalValue::ExternalLinkage; + } + // A non-promoted imported local definition stays local. + // The ThinLTO pass will eventually force-import their definitions. + return SGV->getLinkage(); + + case GlobalValue::ExternalWeakLinkage: + // External weak doesn't apply to definitions, must be a declaration. + assert(!doImportAsDefinition(SGV)); + // Linkage stays external_weak. + return SGV->getLinkage(); + + case GlobalValue::CommonLinkage: + // Linkage stays common on definitions. + // The ThinLTO pass will eventually force-import their definitions.
+ return SGV->getLinkage(); + } + + llvm_unreachable("unknown linkage type"); +} + +void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) { + if (GV.hasLocalLinkage() && + (doPromoteLocalToGlobal(&GV) || isPerformingImport())) { + GV.setName(getName(&GV)); + GV.setLinkage(getLinkage(&GV)); + if (!GV.hasLocalLinkage()) + GV.setVisibility(GlobalValue::HiddenVisibility); + } else + GV.setLinkage(getLinkage(&GV)); + + // Remove functions imported as available externally defs from comdats, + // as this is a declaration for the linker, and will be dropped eventually. + // It is illegal for comdats to contain declarations. + auto *GO = dyn_cast_or_null<GlobalObject>(&GV); + if (GO && GO->isDeclarationForLinker() && GO->hasComdat()) { + // The IRMover should not have placed any imported declarations in + // a comdat, so the only declaration that should be in a comdat + // at this point would be a definition imported as available_externally. + assert(GO->hasAvailableExternallyLinkage() && + "Expected comdat on definition (possibly available external)"); + GO->setComdat(nullptr); + } +} + +void FunctionImportGlobalProcessing::processGlobalsForThinLTO() { + if (!moduleCanBeRenamedForThinLTO(M)) { + // We would have blocked importing from this module by suppressing index + // generation. We still may be able to import into this module though. + assert(!isPerformingImport() && + "Should have blocked importing from module with local used in ASM"); + return; + } + + for (GlobalVariable &GV : M.globals()) + processGlobalForThinLTO(GV); + for (Function &SF : M) + processGlobalForThinLTO(SF); + for (GlobalAlias &GA : M.aliases()) + processGlobalForThinLTO(GA); +} + +bool FunctionImportGlobalProcessing::run() { + processGlobalsForThinLTO(); + return false; +} + +bool llvm::renameModuleForThinLTO( + Module &M, const ModuleSummaryIndex &Index, + DenseSet<const GlobalValue *> *GlobalsToImport) { + FunctionImportGlobalProcessing ThinLTOProcessing(M, Index, GlobalsToImport); + return ThinLTOProcessing.run(); +} diff --git a/lib/Transforms/Utils/GlobalStatus.cpp b/lib/Transforms/Utils/GlobalStatus.cpp index 3893a752503b6..266be41fbeadd 100644 --- a/lib/Transforms/Utils/GlobalStatus.cpp +++ b/lib/Transforms/Utils/GlobalStatus.cpp @@ -20,11 +20,11 @@ using namespace llvm; /// and release, then return AcquireRelease. /// static AtomicOrdering strongerOrdering(AtomicOrdering X, AtomicOrdering Y) { - if (X == Acquire && Y == Release) - return AcquireRelease; - if (Y == Acquire && X == Release) - return AcquireRelease; - return (AtomicOrdering)std::max(X, Y); + if (X == AtomicOrdering::Acquire && Y == AtomicOrdering::Release) + return AtomicOrdering::AcquireRelease; + if (Y == AtomicOrdering::Acquire && X == AtomicOrdering::Release) + return AtomicOrdering::AcquireRelease; + return (AtomicOrdering)std::max((unsigned)X, (unsigned)Y); } /// It is safe to destroy a constant iff it is only used by constants itself. 
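For orientation, the promotion machinery above is driven through the renameModuleForThinLTO entry point that closes this file. A minimal sketch of a caller (hypothetical helper name; header paths assume a tree of this vintage, and a null GlobalsToImport set is taken to mean no import is in progress, per the isPerformingImport() guards above):

#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Transforms/Utils/FunctionImportUtils.h"

// Promote/rename locals in M on the exporting side of a ThinLTO link:
// promoted locals get external linkage, hidden visibility, and a
// module-unique suffix derived from the module hash, so later imports
// cannot introduce name collisions.
static bool prepareForThinLTOExport(llvm::Module &M,
                                    const llvm::ModuleSummaryIndex &Index) {
  return llvm::renameModuleForThinLTO(M, Index, /*GlobalsToImport=*/nullptr);
}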
@@ -105,7 +105,7 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS, } } - if (StoredVal == GV->getInitializer()) { + if (GV->hasInitializer() && StoredVal == GV->getInitializer()) { if (GS.StoredType < GlobalStatus::InitializerStored) GS.StoredType = GlobalStatus::InitializerStored; } else if (isa<LoadInst>(StoredVal) && @@ -185,4 +185,4 @@ GlobalStatus::GlobalStatus() : IsCompared(false), IsLoaded(false), StoredType(NotStored), StoredOnceValue(nullptr), AccessingFunction(nullptr), HasMultipleAccessingFunctions(false), HasNonInstructionUser(false), - Ordering(NotAtomic) {} + Ordering(AtomicOrdering::NotAtomic) {} diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index 79282a2a703b3..1fbb19d2b8add 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -427,6 +427,17 @@ static BasicBlock *HandleCallsInBlockInlinedThroughInvoke( if (!CI || CI->doesNotThrow() || isa<InlineAsm>(CI->getCalledValue())) continue; + // We do not need to (and in fact, cannot) convert possibly throwing calls + // to @llvm.experimental_deoptimize (resp. @llvm.experimental.guard) into + // invokes. The caller's "segment" of the deoptimization continuation + // attached to the newly inlined @llvm.experimental_deoptimize + // (resp. @llvm.experimental.guard) call should contain the exception + // handling logic, if any. + if (auto *F = CI->getCalledFunction()) + if (F->getIntrinsicID() == Intrinsic::experimental_deoptimize || + F->getIntrinsicID() == Intrinsic::experimental_guard) + continue; + if (auto FuncletBundle = CI->getOperandBundle(LLVMContext::OB_funclet)) { // This call is nested inside a funclet. If that funclet has an unwind // destination within the inlinee, then unwinding out of this call would @@ -677,6 +688,34 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock, UnwindDest->removePredecessor(InvokeBB); } +/// When inlining a call site that has !llvm.mem.parallel_loop_access metadata, +/// that metadata should be propagated to all memory-accessing cloned +/// instructions. +static void PropagateParallelLoopAccessMetadata(CallSite CS, + ValueToValueMapTy &VMap) { + MDNode *M = + CS.getInstruction()->getMetadata(LLVMContext::MD_mem_parallel_loop_access); + if (!M) + return; + + for (ValueToValueMapTy::iterator VMI = VMap.begin(), VMIE = VMap.end(); + VMI != VMIE; ++VMI) { + if (!VMI->second) + continue; + + Instruction *NI = dyn_cast<Instruction>(VMI->second); + if (!NI) + continue; + + if (MDNode *PM = NI->getMetadata(LLVMContext::MD_mem_parallel_loop_access)) { + M = MDNode::concatenate(PM, M); + NI->setMetadata(LLVMContext::MD_mem_parallel_loop_access, M); + } else if (NI->mayReadOrWriteMemory()) { + NI->setMetadata(LLVMContext::MD_mem_parallel_loop_access, M); + } + } +} + /// When inlining a function that contains noalias scope metadata, /// this metadata needs to be cloned so that the inlined blocks /// have different "unqiue scopes" at every call site. Were this not done, then @@ -693,12 +732,11 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) { // inter-procedural alias analysis passes. We can revisit this if it becomes // an efficiency or overhead problem. 
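The strongerOrdering hunk above is fallout from AtomicOrdering becoming a scoped enum: its enumerators no longer convert to int implicitly, so std::max needs explicit casts. A self-contained illustration of the same pattern (illustrative enum with an abbreviated enumerator list, not the real LLVM type):

#include <algorithm>

enum class Ordering : unsigned {
  NotAtomic, Monotonic, Acquire, Release, AcquireRelease
};

// Return the stronger of two orderings. Acquire and release combine into
// acq_rel; otherwise the weakest-to-strongest declaration order decides.
static Ordering strongerOf(Ordering X, Ordering Y) {
  if ((X == Ordering::Acquire && Y == Ordering::Release) ||
      (Y == Ordering::Acquire && X == Ordering::Release))
    return Ordering::AcquireRelease;
  return (Ordering)std::max((unsigned)X, (unsigned)Y);
}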
- for (Function::const_iterator I = CalledFunc->begin(), IE = CalledFunc->end(); - I != IE; ++I) - for (BasicBlock::const_iterator J = I->begin(), JE = I->end(); J != JE; ++J) { - if (const MDNode *M = J->getMetadata(LLVMContext::MD_alias_scope)) + for (const BasicBlock &I : *CalledFunc) + for (const Instruction &J : I) { + if (const MDNode *M = J.getMetadata(LLVMContext::MD_alias_scope)) MD.insert(M); - if (const MDNode *M = J->getMetadata(LLVMContext::MD_noalias)) + if (const MDNode *M = J.getMetadata(LLVMContext::MD_noalias)) MD.insert(M); } @@ -720,20 +758,18 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) { // the noalias scopes and the lists of those scopes. SmallVector<TempMDTuple, 16> DummyNodes; DenseMap<const MDNode *, TrackingMDNodeRef> MDMap; - for (SetVector<const MDNode *>::iterator I = MD.begin(), IE = MD.end(); - I != IE; ++I) { + for (const MDNode *I : MD) { DummyNodes.push_back(MDTuple::getTemporary(CalledFunc->getContext(), None)); - MDMap[*I].reset(DummyNodes.back().get()); + MDMap[I].reset(DummyNodes.back().get()); } // Create new metadata nodes to replace the dummy nodes, replacing old // metadata references with either a dummy node or an already-created new // node. - for (SetVector<const MDNode *>::iterator I = MD.begin(), IE = MD.end(); - I != IE; ++I) { + for (const MDNode *I : MD) { SmallVector<Metadata *, 4> NewOps; - for (unsigned i = 0, ie = (*I)->getNumOperands(); i != ie; ++i) { - const Metadata *V = (*I)->getOperand(i); + for (unsigned i = 0, ie = I->getNumOperands(); i != ie; ++i) { + const Metadata *V = I->getOperand(i); if (const MDNode *M = dyn_cast<MDNode>(V)) NewOps.push_back(MDMap[M]); else @@ -741,7 +777,7 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) { } MDNode *NewM = MDNode::get(CalledFunc->getContext(), NewOps); - MDTuple *TempM = cast<MDTuple>(MDMap[*I]); + MDTuple *TempM = cast<MDTuple>(MDMap[I]); assert(TempM->isTemporary() && "Expected temporary node"); TempM->replaceAllUsesWith(NewM); @@ -801,10 +837,9 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap, const Function *CalledFunc = CS.getCalledFunction(); SmallVector<const Argument *, 4> NoAliasArgs; - for (const Argument &I : CalledFunc->args()) { - if (I.hasNoAliasAttr() && !I.hasNUses(0)) - NoAliasArgs.push_back(&I); - } + for (const Argument &Arg : CalledFunc->args()) + if (Arg.hasNoAliasAttr() && !Arg.use_empty()) + NoAliasArgs.push_back(&Arg); if (NoAliasArgs.empty()) return; @@ -885,17 +920,16 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap, IsArgMemOnlyCall = true; } - for (ImmutableCallSite::arg_iterator AI = ICS.arg_begin(), - AE = ICS.arg_end(); AI != AE; ++AI) { + for (Value *Arg : ICS.args()) { // We need to check the underlying objects of all arguments, not just // the pointer arguments, because we might be passing pointers as // integers, etc. // However, if we know that the call only accesses pointer arguments, // then we only need to check the pointer arguments. 
- if (IsArgMemOnlyCall && !(*AI)->getType()->isPointerTy()) + if (IsArgMemOnlyCall && !Arg->getType()->isPointerTy()) continue; - PtrArgs.push_back(*AI); + PtrArgs.push_back(Arg); } } @@ -913,9 +947,9 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap, SmallVector<Metadata *, 4> Scopes, NoAliases; SmallSetVector<const Argument *, 4> NAPtrArgs; - for (unsigned i = 0, ie = PtrArgs.size(); i != ie; ++i) { + for (const Value *V : PtrArgs) { SmallVector<Value *, 4> Objects; - GetUnderlyingObjects(const_cast<Value*>(PtrArgs[i]), + GetUnderlyingObjects(const_cast<Value*>(V), Objects, DL, /* LI = */ nullptr); for (Value *O : Objects) @@ -1228,7 +1262,8 @@ static bool hasLifetimeMarkers(AllocaInst *AI) { /// Rebuild the entire inlined-at chain for this instruction so that the top of /// the chain now is inlined-at the new call site. static DebugLoc -updateInlinedAtInfo(DebugLoc DL, DILocation *InlinedAtNode, LLVMContext &Ctx, +updateInlinedAtInfo(const DebugLoc &DL, DILocation *InlinedAtNode, + LLVMContext &Ctx, DenseMap<const DILocation *, DILocation *> &IANodes) { SmallVector<DILocation *, 3> InlinedAtLocations; DILocation *Last = InlinedAtNode; @@ -1249,8 +1284,7 @@ updateInlinedAtInfo(DebugLoc DL, DILocation *InlinedAtNode, LLVMContext &Ctx, // Starting from the top, rebuild the nodes to point to the new inlined-at // location (then rebuilding the rest of the chain behind it) and update the // map of already-constructed inlined-at nodes. - for (const DILocation *MD : make_range(InlinedAtLocations.rbegin(), - InlinedAtLocations.rend())) { + for (const DILocation *MD : reverse(InlinedAtLocations)) { Last = IANodes[MD] = DILocation::getDistinct( Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last); } @@ -1264,7 +1298,7 @@ updateInlinedAtInfo(DebugLoc DL, DILocation *InlinedAtNode, LLVMContext &Ctx, /// to encode location where these instructions are inlined. static void fixupLineNumbers(Function *Fn, Function::iterator FI, Instruction *TheCall) { - DebugLoc TheCallDL = TheCall->getDebugLoc(); + const DebugLoc &TheCallDL = TheCall->getDebugLoc(); if (!TheCallDL) return; @@ -1422,6 +1456,19 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, } } + // Determine if we are dealing with a call in an EHPad which does not unwind + // to caller. + bool EHPadForCallUnwindsLocally = false; + if (CallSiteEHPad && CS.isCall()) { + UnwindDestMemoTy FuncletUnwindMap; + Value *CallSiteUnwindDestToken = + getUnwindDestToken(CallSiteEHPad, FuncletUnwindMap); + + EHPadForCallUnwindsLocally = + CallSiteUnwindDestToken && + !isa<ConstantTokenNone>(CallSiteUnwindDestToken); + } + // Get an iterator to the last basic block in the function, which will have // the new function inlined after it. Function::iterator LastBlock = --Caller->end(); @@ -1552,6 +1599,9 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Add noalias metadata if necessary. AddAliasScopeMetadata(CS, VMap, DL, CalleeAAR); + // Propagate llvm.mem.parallel_loop_access if necessary. + PropagateParallelLoopAccessMetadata(CS, VMap); + // FIXME: We could register any cloned assumptions instead of clearing the // whole function's cache. 
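The PropagateParallelLoopAccessMetadata step invoked just above reduces to a single merge rule applied to every cloned instruction: concatenate with whatever parallel-loop metadata is already present, otherwise tag instructions that touch memory. A stripped-down sketch of that rule (hypothetical helper name, mirroring the loop body defined earlier in this file's hunks):

#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"

// Merge the call site's parallel-loop metadata M into a cloned instruction.
static void mergeParallelLoopAccess(llvm::Instruction *NI, llvm::MDNode *M) {
  unsigned Kind = llvm::LLVMContext::MD_mem_parallel_loop_access;
  if (llvm::MDNode *PM = NI->getMetadata(Kind))
    NI->setMetadata(Kind, llvm::MDNode::concatenate(PM, M));
  else if (NI->mayReadOrWriteMemory())
    NI->setMetadata(Kind, M);
}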
if (IFI.ACT) @@ -1602,7 +1652,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, replaceDbgDeclareForAlloca(AI, AI, DIB, /*Deref=*/false); } - bool InlinedMustTailCalls = false; + bool InlinedMustTailCalls = false, InlinedDeoptimizeCalls = false; if (InlinedFunctionInfo.ContainsCalls) { CallInst::TailCallKind CallSiteTailKind = CallInst::TCK_None; if (CallInst *CI = dyn_cast<CallInst>(TheCall)) @@ -1615,6 +1665,10 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, if (!CI) continue; + if (Function *F = CI->getCalledFunction()) + InlinedDeoptimizeCalls |= + F->getIntrinsicID() == Intrinsic::experimental_deoptimize; + // We need to reduce the strength of any inlined tail calls. For // musttail, we have to avoid introducing potential unbounded stack // growth. For example, if functions 'f' and 'g' are mutually recursive @@ -1677,11 +1731,14 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, builder.CreateLifetimeStart(AI, AllocaSize); for (ReturnInst *RI : Returns) { - // Don't insert llvm.lifetime.end calls between a musttail call and a - // return. The return kills all local allocas. + // Don't insert llvm.lifetime.end calls between a musttail or deoptimize + // call and a return. The return kills all local allocas. if (InlinedMustTailCalls && RI->getParent()->getTerminatingMustTailCall()) continue; + if (InlinedDeoptimizeCalls && + RI->getParent()->getTerminatingDeoptimizeCall()) + continue; IRBuilder<>(RI).CreateLifetimeEnd(AI, AllocaSize); } } @@ -1702,10 +1759,12 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Insert a call to llvm.stackrestore before any return instructions in the // inlined function. for (ReturnInst *RI : Returns) { - // Don't insert llvm.stackrestore calls between a musttail call and a - // return. The return will restore the stack pointer. + // Don't insert llvm.stackrestore calls between a musttail or deoptimize + // call and a return. The return will restore the stack pointer. if (InlinedMustTailCalls && RI->getParent()->getTerminatingMustTailCall()) continue; + if (InlinedDeoptimizeCalls && RI->getParent()->getTerminatingDeoptimizeCall()) + continue; IRBuilder<>(RI).CreateCall(StackRestore, SavedPtr); } } @@ -1758,7 +1817,6 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, NewInst = CallInst::Create(cast<CallInst>(I), OpBundles, I); else NewInst = InvokeInst::Create(cast<InvokeInst>(I), OpBundles, I); - NewInst->setDebugLoc(I->getDebugLoc()); NewInst->takeName(I); I->replaceAllUsesWith(NewInst); I->eraseFromParent(); @@ -1766,6 +1824,14 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, OpBundles.clear(); } + // It is problematic if the inlinee has a cleanupret which unwinds to + // caller and we inline it into a call site which doesn't unwind but into + // an EH pad that does. Such an edge must be dynamically unreachable. + // As such, we replace the cleanupret with unreachable. + if (auto *CleanupRet = dyn_cast<CleanupReturnInst>(BB->getTerminator())) + if (CleanupRet->unwindsToCaller() && EHPadForCallUnwindsLocally) + changeToUnreachable(CleanupRet, /*UseLLVMTrap=*/false); + Instruction *I = BB->getFirstNonPHI(); if (!I->isEHPad()) continue; @@ -1781,6 +1847,64 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, } } + if (InlinedDeoptimizeCalls) { + // We need to at least remove the deoptimizing returns from the Return set, + // so that the control flow from those returns does not get merged into the + // caller (but terminate it instead). 
If the caller's return type does not + // match the callee's return type, we also need to change the return type of + // the intrinsic. + if (Caller->getReturnType() == TheCall->getType()) { + auto NewEnd = remove_if(Returns, [](ReturnInst *RI) { + return RI->getParent()->getTerminatingDeoptimizeCall() != nullptr; + }); + Returns.erase(NewEnd, Returns.end()); + } else { + SmallVector<ReturnInst *, 8> NormalReturns; + Function *NewDeoptIntrinsic = Intrinsic::getDeclaration( + Caller->getParent(), Intrinsic::experimental_deoptimize, + {Caller->getReturnType()}); + + for (ReturnInst *RI : Returns) { + CallInst *DeoptCall = RI->getParent()->getTerminatingDeoptimizeCall(); + if (!DeoptCall) { + NormalReturns.push_back(RI); + continue; + } + + // The calling convention on the deoptimize call itself may be bogus, + // since the code we're inlining may have undefined behavior (and may + // never actually execute at runtime); but all + // @llvm.experimental.deoptimize declarations have to have the same + // calling convention in a well-formed module. + auto CallingConv = DeoptCall->getCalledFunction()->getCallingConv(); + NewDeoptIntrinsic->setCallingConv(CallingConv); + auto *CurBB = RI->getParent(); + RI->eraseFromParent(); + + SmallVector<Value *, 4> CallArgs(DeoptCall->arg_begin(), + DeoptCall->arg_end()); + + SmallVector<OperandBundleDef, 1> OpBundles; + DeoptCall->getOperandBundlesAsDefs(OpBundles); + DeoptCall->eraseFromParent(); + assert(!OpBundles.empty() && + "Expected at least the deopt operand bundle"); + + IRBuilder<> Builder(CurBB); + CallInst *NewDeoptCall = + Builder.CreateCall(NewDeoptIntrinsic, CallArgs, OpBundles); + NewDeoptCall->setCallingConv(CallingConv); + if (NewDeoptCall->getType()->isVoidTy()) + Builder.CreateRetVoid(); + else + Builder.CreateRet(NewDeoptCall); + } + + // Leave behind the normal returns so we can merge control flow. + std::swap(Returns, NormalReturns); + } + } + // Handle any inlined musttail call sites. In order for a new call site to be // musttail, the source of the clone and the inlined call site must have been // musttail. Therefore it's safe to return without merging control into the diff --git a/lib/Transforms/Utils/InstructionNamer.cpp b/lib/Transforms/Utils/InstructionNamer.cpp index da890a2970051..8a1973d1db051 100644 --- a/lib/Transforms/Utils/InstructionNamer.cpp +++ b/lib/Transforms/Utils/InstructionNamer.cpp @@ -37,13 +37,13 @@ namespace { if (!AI->hasName() && !AI->getType()->isVoidTy()) AI->setName("arg"); - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (!BB->hasName()) - BB->setName("bb"); - - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) - if (!I->hasName() && !I->getType()->isVoidTy()) - I->setName("tmp"); + for (BasicBlock &BB : F) { + if (!BB.hasName()) + BB.setName("bb"); + + for (Instruction &I : BB) + if (!I.hasName() && !I.getType()->isVoidTy()) + I.setName("tmp"); } return true; } diff --git a/lib/Transforms/Utils/IntegerDivision.cpp b/lib/Transforms/Utils/IntegerDivision.cpp index 5687afa61e2a6..5a90dcb033b2a 100644 --- a/lib/Transforms/Utils/IntegerDivision.cpp +++ b/lib/Transforms/Utils/IntegerDivision.cpp @@ -390,6 +390,8 @@ bool llvm::expandRemainder(BinaryOperator *Rem) { Value *Remainder = generateSignedRemainderCode(Rem->getOperand(0), Rem->getOperand(1), Builder); + // Check whether this is the insert point while Rem is still valid. 
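One detail worth calling out in the deoptimize handling above: @llvm.experimental.deoptimize is overloaded only on its return type, so retargeting the inlined calls at the caller's return type is just a matter of requesting a new declaration. The same step in isolation (hypothetical helper name):

#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"

// Materialize @llvm.experimental.deoptimize with return type RetTy, as the
// inliner does above when caller and callee return types differ.
static llvm::Function *getDeoptimizeDecl(llvm::Module &M, llvm::Type *RetTy) {
  return llvm::Intrinsic::getDeclaration(
      &M, llvm::Intrinsic::experimental_deoptimize, {RetTy});
}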
+  bool IsInsertPoint = Rem->getIterator() == Builder.GetInsertPoint();
   Rem->replaceAllUsesWith(Remainder);
   Rem->dropAllReferences();
   Rem->eraseFromParent();
@@ -397,7 +399,7 @@
   // If we didn't actually generate a urem instruction, we're done.
   // This happens for example if the inputs were constant. In this case the
   // Builder insertion point was unchanged.
-  if (Rem == Builder.GetInsertPoint().getNodePtrUnchecked())
+  if (IsInsertPoint)
     return true;
   BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
@@ -446,6 +448,9 @@ bool llvm::expandDivision(BinaryOperator *Div) {
   // Lower the code to unsigned division, and reset Div to point to the udiv.
   Value *Quotient = generateSignedDivisionCode(Div->getOperand(0),
                                                Div->getOperand(1), Builder);
+
+  // Check whether this is the insert point while Div is still valid.
+  bool IsInsertPoint = Div->getIterator() == Builder.GetInsertPoint();
   Div->replaceAllUsesWith(Quotient);
   Div->dropAllReferences();
   Div->eraseFromParent();
@@ -453,7 +458,7 @@
   // If we didn't actually generate a udiv instruction, we're done.
   // This happens for example if the inputs were constant. In this case the
   // Builder insertion point was unchanged.
-  if (Div == Builder.GetInsertPoint().getNodePtrUnchecked())
+  if (IsInsertPoint)
     return true;
   BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp
index b4b2e148dfbb1..9658966779b9a 100644
--- a/lib/Transforms/Utils/LCSSA.cpp
+++ b/lib/Transforms/Utils/LCSSA.cpp
@@ -27,10 +27,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/LCSSA.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
@@ -41,6 +42,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/PredIteratorCache.h"
 #include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 using namespace llvm;
@@ -52,154 +54,156 @@ STATISTIC(NumLCSSA, "Number of live out of a loop variables");
 /// Return true if the specified block is in the list.
 static bool isExitBlock(BasicBlock *BB,
                         const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
-  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
-    if (ExitBlocks[i] == BB)
-      return true;
-  return false;
+  return find(ExitBlocks, BB) != ExitBlocks.end();
 }
 
-/// Given an instruction in the loop, check to see if it has any uses that are
-/// outside the current loop. If so, insert LCSSA PHI nodes and rewrite the
-/// uses.
-static bool processInstruction(Loop &L, Instruction &Inst, DominatorTree &DT,
-                               const SmallVectorImpl<BasicBlock *> &ExitBlocks,
-                               PredIteratorCache &PredCache, LoopInfo *LI) {
+/// For every instruction from the worklist, check to see if it has any uses
+/// that are outside the current loop. If so, insert LCSSA PHI nodes and
+/// rewrite the uses.
+bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, + DominatorTree &DT, LoopInfo &LI) { SmallVector<Use *, 16> UsesToRewrite; + SmallVector<BasicBlock *, 8> ExitBlocks; + PredIteratorCache PredCache; + bool Changed = false; - // Tokens cannot be used in PHI nodes, so we skip over them. - // We can run into tokens which are live out of a loop with catchswitch - // instructions in Windows EH if the catchswitch has one catchpad which - // is inside the loop and another which is not. - if (Inst.getType()->isTokenTy()) - return false; + while (!Worklist.empty()) { + UsesToRewrite.clear(); + ExitBlocks.clear(); - BasicBlock *InstBB = Inst.getParent(); + Instruction *I = Worklist.pop_back_val(); + BasicBlock *InstBB = I->getParent(); + Loop *L = LI.getLoopFor(InstBB); + L->getExitBlocks(ExitBlocks); - for (Use &U : Inst.uses()) { - Instruction *User = cast<Instruction>(U.getUser()); - BasicBlock *UserBB = User->getParent(); - if (PHINode *PN = dyn_cast<PHINode>(User)) - UserBB = PN->getIncomingBlock(U); + if (ExitBlocks.empty()) + continue; - if (InstBB != UserBB && !L.contains(UserBB)) - UsesToRewrite.push_back(&U); - } + // Tokens cannot be used in PHI nodes, so we skip over them. + // We can run into tokens which are live out of a loop with catchswitch + // instructions in Windows EH if the catchswitch has one catchpad which + // is inside the loop and another which is not. + if (I->getType()->isTokenTy()) + continue; - // If there are no uses outside the loop, exit with no change. - if (UsesToRewrite.empty()) - return false; + for (Use &U : I->uses()) { + Instruction *User = cast<Instruction>(U.getUser()); + BasicBlock *UserBB = User->getParent(); + if (PHINode *PN = dyn_cast<PHINode>(User)) + UserBB = PN->getIncomingBlock(U); - ++NumLCSSA; // We are applying the transformation + if (InstBB != UserBB && !L->contains(UserBB)) + UsesToRewrite.push_back(&U); + } - // Invoke instructions are special in that their result value is not available - // along their unwind edge. The code below tests to see whether DomBB - // dominates the value, so adjust DomBB to the normal destination block, - // which is effectively where the value is first usable. - BasicBlock *DomBB = Inst.getParent(); - if (InvokeInst *Inv = dyn_cast<InvokeInst>(&Inst)) - DomBB = Inv->getNormalDest(); + // If there are no uses outside the loop, exit with no change. + if (UsesToRewrite.empty()) + continue; - DomTreeNode *DomNode = DT.getNode(DomBB); + ++NumLCSSA; // We are applying the transformation - SmallVector<PHINode *, 16> AddedPHIs; - SmallVector<PHINode *, 8> PostProcessPHIs; + // Invoke instructions are special in that their result value is not + // available along their unwind edge. The code below tests to see whether + // DomBB dominates the value, so adjust DomBB to the normal destination + // block, which is effectively where the value is first usable. + BasicBlock *DomBB = InstBB; + if (InvokeInst *Inv = dyn_cast<InvokeInst>(I)) + DomBB = Inv->getNormalDest(); - SSAUpdater SSAUpdate; - SSAUpdate.Initialize(Inst.getType(), Inst.getName()); + DomTreeNode *DomNode = DT.getNode(DomBB); - // Insert the LCSSA phi's into all of the exit blocks dominated by the - // value, and add them to the Phi's map. - for (BasicBlock *ExitBB : ExitBlocks) { - if (!DT.dominates(DomNode, DT.getNode(ExitBB))) - continue; + SmallVector<PHINode *, 16> AddedPHIs; + SmallVector<PHINode *, 8> PostProcessPHIs; - // If we already inserted something for this BB, don't reprocess it. 
-    if (SSAUpdate.HasValueForBlock(ExitBB))
-      continue;
+    SSAUpdater SSAUpdate;
+    SSAUpdate.Initialize(I->getType(), I->getName());
-    PHINode *PN = PHINode::Create(Inst.getType(), PredCache.size(ExitBB),
-                                  Inst.getName() + ".lcssa", &ExitBB->front());
+    // Insert the LCSSA phi's into all of the exit blocks dominated by the
+    // value, and add them to the Phi's map.
+    for (BasicBlock *ExitBB : ExitBlocks) {
+      if (!DT.dominates(DomNode, DT.getNode(ExitBB)))
+        continue;
-    // Add inputs from inside the loop for this PHI.
-    for (BasicBlock *Pred : PredCache.get(ExitBB)) {
-      PN->addIncoming(&Inst, Pred);
+      // If we already inserted something for this BB, don't reprocess it.
+      if (SSAUpdate.HasValueForBlock(ExitBB))
+        continue;
-      // If the exit block has a predecessor not within the loop, arrange for
-      // the incoming value use corresponding to that predecessor to be
-      // rewritten in terms of a different LCSSA PHI.
-      if (!L.contains(Pred))
-        UsesToRewrite.push_back(
-            &PN->getOperandUse(PN->getOperandNumForIncomingValue(
-                PN->getNumIncomingValues() - 1)));
+      PHINode *PN = PHINode::Create(I->getType(), PredCache.size(ExitBB),
+                                    I->getName() + ".lcssa", &ExitBB->front());
+
+      // Add inputs from inside the loop for this PHI.
+      for (BasicBlock *Pred : PredCache.get(ExitBB)) {
+        PN->addIncoming(I, Pred);
+
+        // If the exit block has a predecessor not within the loop, arrange for
+        // the incoming value use corresponding to that predecessor to be
+        // rewritten in terms of a different LCSSA PHI.
+        if (!L->contains(Pred))
+          UsesToRewrite.push_back(
+              &PN->getOperandUse(PN->getOperandNumForIncomingValue(
+                  PN->getNumIncomingValues() - 1)));
+      }
+
+      AddedPHIs.push_back(PN);
+
+      // Remember that this phi makes the value alive in this block.
+      SSAUpdate.AddAvailableValue(ExitBB, PN);
+
+      // LoopSimplify might fail to simplify some loops (e.g. when indirect
+      // branches are involved). In such situations, it might happen that an
+      // exit for Loop L1 is the header of a disjoint Loop L2. Thus, when we
+      // create PHIs in such an exit block, we are also inserting PHIs into L2's
+      // header. This could break LCSSA form for L2 because these inserted PHIs
+      // can also have uses outside of L2. Remember all PHIs in such situations
+      // so we can revisit them later on. FIXME: Remove this if indirectbr
+      // support in LoopSimplify gets improved.
+      if (auto *OtherLoop = LI.getLoopFor(ExitBB))
+        if (!L->contains(OtherLoop))
+          PostProcessPHIs.push_back(PN);
+    }
-    AddedPHIs.push_back(PN);
-
-    // Remember that this phi makes the value alive in this block.
-    SSAUpdate.AddAvailableValue(ExitBB, PN);
-
-    // LoopSimplify might fail to simplify some loops (e.g. when indirect
-    // branches are involved). In such situations, it might happen that an exit
-    // for Loop L1 is the header of a disjoint Loop L2. Thus, when we create
-    // PHIs in such an exit block, we are also inserting PHIs into L2's header.
-    // This could break LCSSA form for L2 because these inserted PHIs can also
-    // have uses outside of L2. Remember all PHIs in such situation as to
-    // revisit than later on. FIXME: Remove this if indirectbr support into
-    // LoopSimplify gets improved.
-    if (auto *OtherLoop = LI->getLoopFor(ExitBB))
-      if (!L.contains(OtherLoop))
-        PostProcessPHIs.push_back(PN);
-  }
+    // Rewrite all uses outside the loop in terms of the new PHIs we just
+    // inserted.
+    for (Use *UseToRewrite : UsesToRewrite) {
+      // If this use is in an exit block, rewrite to use the newly inserted PHI.
+ // This is required for correctness because SSAUpdate doesn't handle uses + // in the same block. It assumes the PHI we inserted is at the end of the + // block. + Instruction *User = cast<Instruction>(UseToRewrite->getUser()); + BasicBlock *UserBB = User->getParent(); + if (PHINode *PN = dyn_cast<PHINode>(User)) + UserBB = PN->getIncomingBlock(*UseToRewrite); + + if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) { + // Tell the VHs that the uses changed. This updates SCEV's caches. + if (UseToRewrite->get()->hasValueHandle()) + ValueHandleBase::ValueIsRAUWd(*UseToRewrite, &UserBB->front()); + UseToRewrite->set(&UserBB->front()); + continue; + } - // Rewrite all uses outside the loop in terms of the new PHIs we just - // inserted. - for (Use *UseToRewrite : UsesToRewrite) { - // If this use is in an exit block, rewrite to use the newly inserted PHI. - // This is required for correctness because SSAUpdate doesn't handle uses in - // the same block. It assumes the PHI we inserted is at the end of the - // block. - Instruction *User = cast<Instruction>(UseToRewrite->getUser()); - BasicBlock *UserBB = User->getParent(); - if (PHINode *PN = dyn_cast<PHINode>(User)) - UserBB = PN->getIncomingBlock(*UseToRewrite); - - if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) { - // Tell the VHs that the uses changed. This updates SCEV's caches. - if (UseToRewrite->get()->hasValueHandle()) - ValueHandleBase::ValueIsRAUWd(*UseToRewrite, &UserBB->front()); - UseToRewrite->set(&UserBB->front()); - continue; + // Otherwise, do full PHI insertion. + SSAUpdate.RewriteUse(*UseToRewrite); } - // Otherwise, do full PHI insertion. - SSAUpdate.RewriteUse(*UseToRewrite); - } + // Post process PHI instructions that were inserted into another disjoint + // loop and update their exits properly. + for (auto *PostProcessPN : PostProcessPHIs) { + if (PostProcessPN->use_empty()) + continue; - // Post process PHI instructions that were inserted into another disjoint loop - // and update their exits properly. - for (auto *I : PostProcessPHIs) { - if (I->use_empty()) - continue; + // Reprocess each PHI instruction. + Worklist.push_back(PostProcessPN); + } - BasicBlock *PHIBB = I->getParent(); - Loop *OtherLoop = LI->getLoopFor(PHIBB); - SmallVector<BasicBlock *, 8> EBs; - OtherLoop->getExitBlocks(EBs); - if (EBs.empty()) - continue; + // Remove PHI nodes that did not have any uses rewritten. + for (PHINode *PN : AddedPHIs) + if (PN->use_empty()) + PN->eraseFromParent(); - // Recurse and re-process each PHI instruction. FIXME: we should really - // convert this entire thing to a worklist approach where we process a - // vector of instructions... - processInstruction(*OtherLoop, *I, DT, EBs, PredCache, LI); + Changed = true; } - - // Remove PHI nodes that did not have any uses rewritten. 
- for (PHINode *PN : AddedPHIs) - if (PN->use_empty()) - PN->eraseFromParent(); - - return true; + return Changed; } /// Return true if the specified block dominates at least @@ -209,11 +213,9 @@ blockDominatesAnExit(BasicBlock *BB, DominatorTree &DT, const SmallVectorImpl<BasicBlock *> &ExitBlocks) { DomTreeNode *DomNode = DT.getNode(BB); - for (BasicBlock *ExitBB : ExitBlocks) - if (DT.dominates(DomNode, DT.getNode(ExitBB))) - return true; - - return false; + return llvm::any_of(ExitBlocks, [&](BasicBlock * EB) { + return DT.dominates(DomNode, DT.getNode(EB)); + }); } bool llvm::formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI, @@ -227,10 +229,10 @@ bool llvm::formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI, if (ExitBlocks.empty()) return false; - PredIteratorCache PredCache; + SmallVector<Instruction *, 8> Worklist; // Look at all the instructions in the loop, checking to see if they have uses - // outside the loop. If so, rewrite those uses. + // outside the loop. If so, put them into the worklist to rewrite those uses. for (BasicBlock *BB : L.blocks()) { // For large loops, avoid use-scanning by using dominance information: In // particular, if a block does not dominate any of the loop exits, then none @@ -246,9 +248,10 @@ bool llvm::formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI, !isa<PHINode>(I.user_back()))) continue; - Changed |= processInstruction(L, I, DT, ExitBlocks, PredCache, LI); + Worklist.push_back(&I); } } + Changed = formLCSSAForInstructions(Worklist, DT, *LI); // If we modified the code, remove any caches about the loop from SCEV to // avoid dangling entries. @@ -274,11 +277,20 @@ bool llvm::formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI, return Changed; } +/// Process all loops in the function, inner-most out. +static bool formLCSSAOnAllLoops(LoopInfo *LI, DominatorTree &DT, + ScalarEvolution *SE) { + bool Changed = false; + for (auto &L : *LI) + Changed |= formLCSSARecursively(*L, DT, LI, SE); + return Changed; +} + namespace { -struct LCSSA : public FunctionPass { +struct LCSSAWrapperPass : public FunctionPass { static char ID; // Pass identification, replacement for typeid - LCSSA() : FunctionPass(ID) { - initializeLCSSAPass(*PassRegistry::getPassRegistry()); + LCSSAWrapperPass() : FunctionPass(ID) { + initializeLCSSAWrapperPassPass(*PassRegistry::getPassRegistry()); } // Cached analysis information for the current function. 
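The worklist conversion above also turns the per-instruction logic into a reusable utility: a caller seeds the worklist with instructions whose uses escape their loop, and exit-block discovery, PHI insertion, and reprocessing of PHIs that land in disjoint loop headers all happen inside. A minimal usage sketch (hypothetical wrapper, assuming the declaration sits alongside formLCSSA in LoopUtils.h):

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

// Put a single escaping value into LCSSA form via the new worklist API.
static bool rewriteOneEscapingValue(llvm::Instruction *I,
                                    llvm::DominatorTree &DT,
                                    llvm::LoopInfo &LI) {
  llvm::SmallVector<llvm::Instruction *, 1> Worklist;
  Worklist.push_back(I);
  return llvm::formLCSSAForInstructions(Worklist, DT, LI);
}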
@@ -298,6 +310,7 @@ struct LCSSA : public FunctionPass { AU.addRequired<LoopInfoWrapperPass>(); AU.addPreservedID(LoopSimplifyID); AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<BasicAAWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addPreserved<SCEVAAWrapperPass>(); @@ -305,30 +318,39 @@ struct LCSSA : public FunctionPass { }; } -char LCSSA::ID = 0; -INITIALIZE_PASS_BEGIN(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false) +char LCSSAWrapperPass::ID = 0; +INITIALIZE_PASS_BEGIN(LCSSAWrapperPass, "lcssa", "Loop-Closed SSA Form Pass", + false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) -INITIALIZE_PASS_END(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false) - -Pass *llvm::createLCSSAPass() { return new LCSSA(); } -char &llvm::LCSSAID = LCSSA::ID; +INITIALIZE_PASS_END(LCSSAWrapperPass, "lcssa", "Loop-Closed SSA Form Pass", + false, false) +Pass *llvm::createLCSSAPass() { return new LCSSAWrapperPass(); } +char &llvm::LCSSAID = LCSSAWrapperPass::ID; -/// Process all loops in the function, inner-most out. -bool LCSSA::runOnFunction(Function &F) { - bool Changed = false; +/// Transform \p F into loop-closed SSA form. +bool LCSSAWrapperPass::runOnFunction(Function &F) { LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); SE = SEWP ? &SEWP->getSE() : nullptr; - // Simplify each loop nest in the function. - for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) - Changed |= formLCSSARecursively(**I, *DT, LI, SE); - - return Changed; + return formLCSSAOnAllLoops(LI, *DT, SE); } +PreservedAnalyses LCSSAPass::run(Function &F, AnalysisManager<Function> &AM) { + auto &LI = AM.getResult<LoopAnalysis>(F); + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + auto *SE = AM.getCachedResult<ScalarEvolutionAnalysis>(F); + if (!formLCSSAOnAllLoops(&LI, DT, SE)) + return PreservedAnalyses::all(); + + // FIXME: This should also 'preserve the CFG'. + PreservedAnalyses PA; + PA.preserve<BasicAA>(); + PA.preserve<GlobalsAA>(); + PA.preserve<SCEVAA>(); + PA.preserve<ScalarEvolutionAnalysis>(); + return PA; +} diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index abc9b65f7a394..f1838d891466e 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -42,11 +42,13 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; +using namespace llvm::PatternMatch; #define DEBUG_TYPE "local" @@ -148,9 +150,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, SmallVector<uint32_t, 8> Weights; for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e; ++MD_i) { - ConstantInt *CI = - mdconst::dyn_extract<ConstantInt>(MD->getOperand(MD_i)); - assert(CI); + auto *CI = mdconst::extract<ConstantInt>(MD->getOperand(MD_i)); Weights.push_back(CI->getValue().getZExtValue()); } // Merge weight of this case to the default weight. 
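With the wrapper/pass split made above, the new pass manager can schedule LCSSA directly. A minimal sketch of wiring it up by hand (assumes the analysis manager already has the usual function analyses registered):

#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Utils/LCSSA.h"

// Run the new-PM LCSSAPass over a single function.
static void runLCSSA(llvm::Function &F,
                     llvm::AnalysisManager<llvm::Function> &AM) {
  llvm::FunctionPassManager FPM;
  FPM.addPass(llvm::LCSSAPass());
  FPM.run(F, AM);
}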
@@ -321,8 +321,12 @@ bool llvm::isInstructionTriviallyDead(Instruction *I,
         II->getIntrinsicID() == Intrinsic::lifetime_end)
       return isa<UndefValue>(II->getArgOperand(1));
 
-    // Assumptions are dead if their condition is trivially true.
-    if (II->getIntrinsicID() == Intrinsic::assume) {
+    // Assumptions are dead if their condition is trivially true. Guards on
+    // true are operationally no-ops. In the future we can consider more
+    // sophisticated tradeoffs for guards considering potential for check
+    // widening, but for now we keep things simple.
+    if (II->getIntrinsicID() == Intrinsic::assume ||
+        II->getIntrinsicID() == Intrinsic::experimental_guard) {
       if (ConstantInt *Cond = dyn_cast<ConstantInt>(II->getArgOperand(0)))
         return !Cond->isZero();
 
@@ -452,14 +456,23 @@ simplifyAndDCEInstruction(Instruction *I,
   if (Value *SimpleV = SimplifyInstruction(I, DL)) {
     // Add the users to the worklist. CAREFUL: an instruction can use itself,
     // in the case of a phi node.
-    for (User *U : I->users())
-      if (U != I)
+    for (User *U : I->users()) {
+      if (U != I) {
         WorkList.insert(cast<Instruction>(U));
+      }
+    }
 
     // Replace the instruction with its simplified value.
-    I->replaceAllUsesWith(SimpleV);
-    I->eraseFromParent();
-    return true;
+    bool Changed = false;
+    if (!I->use_empty()) {
+      I->replaceAllUsesWith(SimpleV);
+      Changed = true;
+    }
+    if (isInstructionTriviallyDead(I, TLI)) {
+      I->eraseFromParent();
+      Changed = true;
+    }
+    return Changed;
   }
   return false;
 }
@@ -486,7 +499,8 @@ bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB,
   // Iterate over the original function, only adding insts to the worklist
   // if they actually need to be revisited. This avoids having to pre-init
   // the worklist with the entire function's worth of instructions.
-  for (BasicBlock::iterator BI = BB->begin(), E = std::prev(BB->end()); BI != E;) {
+  for (BasicBlock::iterator BI = BB->begin(), E = std::prev(BB->end());
+       BI != E;) {
     assert(!BI->isTerminator());
     Instruction *I = &*BI;
     ++BI;
@@ -1025,7 +1039,8 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
 ///
 /// See if there is a dbg.value intrinsic for DIVar before I.
-static bool LdStHasDebugValue(const DILocalVariable *DIVar, Instruction *I) {
+static bool LdStHasDebugValue(DILocalVariable *DIVar, DIExpression *DIExpr,
+                              Instruction *I) {
   // Since we can't guarantee that the original dbg.declare intrinsic
   // is removed by LowerDbgDeclare(), we need to make sure that we are
   // not inserting the same dbg.value intrinsic over and over.
@@ -1035,7 +1050,8 @@
     if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(PrevI))
       if (DVI->getValue() == I->getOperand(0) &&
           DVI->getOffset() == 0 &&
-          DVI->getVariable() == DIVar)
+          DVI->getVariable() == DIVar &&
+          DVI->getExpression() == DIExpr)
         return true;
   }
   return false;
@@ -1049,9 +1065,6 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
   auto *DIExpr = DDI->getExpression();
   assert(DIVar && "Missing variable");
 
-  if (LdStHasDebugValue(DIVar, SI))
-    return true;
-
   // If an argument is zero extended then use argument directly. The ZExt
   // may be zapped by an optimization pass in future.
   Argument *ExtendedArg = nullptr;
@@ -1066,25 +1079,25 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
   // to the alloca described by DDI, if its first operand is an extend,
   // we're guaranteed that before extension, the value was narrower than
   // the size of the described variable.
- SmallVector<uint64_t, 3> NewDIExpr; + SmallVector<uint64_t, 3> Ops; unsigned PieceOffset = 0; // If this already is a bit piece, we drop the bit piece from the expression // and record the offset. if (DIExpr->isBitPiece()) { - NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end()-3); + Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()-3); PieceOffset = DIExpr->getBitPieceOffset(); } else { - NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end()); + Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()); } - NewDIExpr.push_back(dwarf::DW_OP_bit_piece); - NewDIExpr.push_back(PieceOffset); //Offset + Ops.push_back(dwarf::DW_OP_bit_piece); + Ops.push_back(PieceOffset); // Offset const DataLayout &DL = DDI->getModule()->getDataLayout(); - NewDIExpr.push_back(DL.getTypeSizeInBits(ExtendedArg->getType())); // Size - Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, - Builder.createExpression(NewDIExpr), - DDI->getDebugLoc(), SI); - } - else + Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType())); // Size + auto NewDIExpr = Builder.createExpression(Ops); + if (!LdStHasDebugValue(DIVar, NewDIExpr, SI)) + Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, NewDIExpr, + DDI->getDebugLoc(), SI); + } else if (!LdStHasDebugValue(DIVar, DIExpr, SI)) Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, DIExpr, DDI->getDebugLoc(), SI); return true; @@ -1098,7 +1111,7 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, auto *DIExpr = DDI->getExpression(); assert(DIVar && "Missing variable"); - if (LdStHasDebugValue(DIVar, LI)) + if (LdStHasDebugValue(DIVar, DIExpr, LI)) return true; // We are now tracking the loaded value instead of the address. In the @@ -1140,12 +1153,14 @@ bool llvm::LowerDbgDeclare(Function &F) { // the stack slot (and at a lexical-scope granularity). Later // passes will attempt to elide the stack slot. if (AI && !isArray(AI)) { - for (User *U : AI->users()) - if (StoreInst *SI = dyn_cast<StoreInst>(U)) - ConvertDebugDeclareToDebugValue(DDI, SI, DIB); - else if (LoadInst *LI = dyn_cast<LoadInst>(U)) + for (auto &AIUse : AI->uses()) { + User *U = AIUse.getUser(); + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { + if (AIUse.getOperandNo() == 1) + ConvertDebugDeclareToDebugValue(DDI, SI, DIB); + } else if (LoadInst *LI = dyn_cast<LoadInst>(U)) { ConvertDebugDeclareToDebugValue(DDI, LI, DIB); - else if (CallInst *CI = dyn_cast<CallInst>(U)) { + } else if (CallInst *CI = dyn_cast<CallInst>(U)) { // This is a call by-value or some other instruction that // takes a pointer to the variable. Insert a *value* // intrinsic that describes the alloca. 
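The DIExpression surgery earlier in this hunk follows a fixed recipe: copy the existing expression elements, then append DW_OP_bit_piece with an offset and a size. The same recipe in isolation (hypothetical helper; the Dwarf.h path is the one used by this era of the tree):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/Support/Dwarf.h"

// Append an (offset, size) bit-piece descriptor to an existing expression.
static llvm::DIExpression *withBitPiece(llvm::DIBuilder &DIB,
                                        llvm::DIExpression *Expr,
                                        uint64_t OffsetInBits,
                                        uint64_t SizeInBits) {
  llvm::SmallVector<uint64_t, 8> Ops(Expr->elements_begin(),
                                     Expr->elements_end());
  Ops.push_back(llvm::dwarf::DW_OP_bit_piece);
  Ops.push_back(OffsetInBits);
  Ops.push_back(SizeInBits);
  return DIB.createExpression(Ops);
}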
@@ -1157,6 +1172,7 @@ bool llvm::LowerDbgDeclare(Function &F) { DIB.createExpression(NewDIExpr), DDI->getDebugLoc(), CI); } + } DDI->eraseFromParent(); } } @@ -1175,6 +1191,38 @@ DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) { return nullptr; } +static void DIExprAddDeref(SmallVectorImpl<uint64_t> &Expr) { + Expr.push_back(dwarf::DW_OP_deref); +} + +static void DIExprAddOffset(SmallVectorImpl<uint64_t> &Expr, int Offset) { + if (Offset > 0) { + Expr.push_back(dwarf::DW_OP_plus); + Expr.push_back(Offset); + } else if (Offset < 0) { + Expr.push_back(dwarf::DW_OP_minus); + Expr.push_back(-Offset); + } +} + +static DIExpression *BuildReplacementDIExpr(DIBuilder &Builder, + DIExpression *DIExpr, bool Deref, + int Offset) { + if (!Deref && !Offset) + return DIExpr; + // Create a copy of the original DIDescriptor for user variable, prepending + // "deref" operation to a list of address elements, as new llvm.dbg.declare + // will take a value storing address of the memory for variable, not + // alloca itself. + SmallVector<uint64_t, 4> NewDIExpr; + if (Deref) + DIExprAddDeref(NewDIExpr); + DIExprAddOffset(NewDIExpr, Offset); + if (DIExpr) + NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end()); + return Builder.createExpression(NewDIExpr); +} + bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress, Instruction *InsertBefore, DIBuilder &Builder, bool Deref, int Offset) { @@ -1186,25 +1234,7 @@ bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress, auto *DIExpr = DDI->getExpression(); assert(DIVar && "Missing variable"); - if (Deref || Offset) { - // Create a copy of the original DIDescriptor for user variable, prepending - // "deref" operation to a list of address elements, as new llvm.dbg.declare - // will take a value storing address of the memory for variable, not - // alloca itself. - SmallVector<uint64_t, 4> NewDIExpr; - if (Deref) - NewDIExpr.push_back(dwarf::DW_OP_deref); - if (Offset > 0) { - NewDIExpr.push_back(dwarf::DW_OP_plus); - NewDIExpr.push_back(Offset); - } else if (Offset < 0) { - NewDIExpr.push_back(dwarf::DW_OP_minus); - NewDIExpr.push_back(-Offset); - } - if (DIExpr) - NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end()); - DIExpr = Builder.createExpression(NewDIExpr); - } + DIExpr = BuildReplacementDIExpr(Builder, DIExpr, Deref, Offset); // Insert llvm.dbg.declare immediately after the original alloca, and remove // old llvm.dbg.declare. @@ -1219,12 +1249,73 @@ bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, Deref, Offset); } -void llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap) { +static void replaceOneDbgValueForAlloca(DbgValueInst *DVI, Value *NewAddress, + DIBuilder &Builder, int Offset) { + DebugLoc Loc = DVI->getDebugLoc(); + auto *DIVar = DVI->getVariable(); + auto *DIExpr = DVI->getExpression(); + assert(DIVar && "Missing variable"); + + // This is an alloca-based llvm.dbg.value. The first thing it should do with + // the alloca pointer is dereference it. Otherwise we don't know how to handle + // it and give up. + if (!DIExpr || DIExpr->getNumElements() < 1 || + DIExpr->getElement(0) != dwarf::DW_OP_deref) + return; + + // Insert the offset immediately after the first deref. + // We could just change the offset argument of dbg.value, but it's unsigned... 
+ if (Offset) { + SmallVector<uint64_t, 4> NewDIExpr; + DIExprAddDeref(NewDIExpr); + DIExprAddOffset(NewDIExpr, Offset); + NewDIExpr.append(DIExpr->elements_begin() + 1, DIExpr->elements_end()); + DIExpr = Builder.createExpression(NewDIExpr); + } + + Builder.insertDbgValueIntrinsic(NewAddress, DVI->getOffset(), DIVar, DIExpr, + Loc, DVI); + DVI->eraseFromParent(); +} + +void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress, + DIBuilder &Builder, int Offset) { + if (auto *L = LocalAsMetadata::getIfExists(AI)) + if (auto *MDV = MetadataAsValue::getIfExists(AI->getContext(), L)) + for (auto UI = MDV->use_begin(), UE = MDV->use_end(); UI != UE;) { + Use &U = *UI++; + if (auto *DVI = dyn_cast<DbgValueInst>(U.getUser())) + replaceOneDbgValueForAlloca(DVI, NewAllocaAddress, Builder, Offset); + } +} + +unsigned llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) { + unsigned NumDeadInst = 0; + // Delete the instructions backwards, as it has a reduced likelihood of + // having to update as many def-use and use-def chains. + Instruction *EndInst = BB->getTerminator(); // Last not to be deleted. + while (EndInst != &BB->front()) { + // Delete the next to last instruction. + Instruction *Inst = &*--EndInst->getIterator(); + if (!Inst->use_empty() && !Inst->getType()->isTokenTy()) + Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); + if (Inst->isEHPad() || Inst->getType()->isTokenTy()) { + EndInst = Inst; + continue; + } + if (!isa<DbgInfoIntrinsic>(Inst)) + ++NumDeadInst; + Inst->eraseFromParent(); + } + return NumDeadInst; +} + +unsigned llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap) { BasicBlock *BB = I->getParent(); // Loop over all of the successors, removing BB's entry from any PHI // nodes. - for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) - (*SI)->removePredecessor(BB); + for (BasicBlock *Successor : successors(BB)) + Successor->removePredecessor(BB); // Insert a call to llvm.trap right before this. This turns the undefined // behavior into a hard fail instead of falling through into random code. @@ -1237,12 +1328,15 @@ void llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap) { new UnreachableInst(I->getContext(), I); // All instructions after this are dead. + unsigned NumInstrsRemoved = 0; BasicBlock::iterator BBI = I->getIterator(), BBE = BB->end(); while (BBI != BBE) { if (!BBI->use_empty()) BBI->replaceAllUsesWith(UndefValue::get(BBI->getType())); BB->getInstList().erase(BBI++); + ++NumInstrsRemoved; } + return NumInstrsRemoved; } /// changeToCall - Convert the specified invoke into a normal call. @@ -1280,36 +1374,52 @@ static bool markAliveBlocks(Function &F, // Do a quick scan of the basic block, turning any obviously unreachable // instructions into LLVM unreachable insts. The instruction combining pass // canonicalizes unreachable insts into stores to null or undef. - for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;++BBI){ + for (Instruction &I : *BB) { // Assumptions that are known to be false are equivalent to unreachable. // Also, if the condition is undefined, then we make the choice most // beneficial to the optimizer, and choose that to also be unreachable. 
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(BBI)) + if (auto *II = dyn_cast<IntrinsicInst>(&I)) { if (II->getIntrinsicID() == Intrinsic::assume) { - bool MakeUnreachable = false; - if (isa<UndefValue>(II->getArgOperand(0))) - MakeUnreachable = true; - else if (ConstantInt *Cond = - dyn_cast<ConstantInt>(II->getArgOperand(0))) - MakeUnreachable = Cond->isZero(); - - if (MakeUnreachable) { + if (match(II->getArgOperand(0), m_CombineOr(m_Zero(), m_Undef()))) { // Don't insert a call to llvm.trap right before the unreachable. - changeToUnreachable(&*BBI, false); + changeToUnreachable(II, false); Changed = true; break; } } - if (CallInst *CI = dyn_cast<CallInst>(BBI)) { + if (II->getIntrinsicID() == Intrinsic::experimental_guard) { + // A call to the guard intrinsic bails out of the current compilation + // unit if the predicate passed to it is false. If the predicate is a + // constant false, then we know the guard will bail out of the current + // compile unconditionally, so all code following it is dead. + // + // Note: unlike in llvm.assume, it is not "obviously profitable" for + // guards to treat `undef` as `false` since a guard on `undef` can + // still be useful for widening. + if (match(II->getArgOperand(0), m_Zero())) + if (!isa<UnreachableInst>(II->getNextNode())) { + changeToUnreachable(II->getNextNode(), /*UseLLVMTrap=*/ false); + Changed = true; + break; + } + } + } + + if (auto *CI = dyn_cast<CallInst>(&I)) { + Value *Callee = CI->getCalledValue(); + if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { + changeToUnreachable(CI, /*UseLLVMTrap=*/false); + Changed = true; + break; + } if (CI->doesNotReturn()) { // If we found a call to a no-return function, insert an unreachable // instruction after it. Make sure there isn't *already* one there // though. - ++BBI; - if (!isa<UnreachableInst>(BBI)) { + if (!isa<UnreachableInst>(CI->getNextNode())) { // Don't insert a call to llvm.trap right before the unreachable. - changeToUnreachable(&*BBI, false); + changeToUnreachable(CI->getNextNode(), false); Changed = true; } break; @@ -1319,7 +1429,7 @@ static bool markAliveBlocks(Function &F, // Store to undef and store to null are undefined and used to signal that // they should be changed to unreachable by passes that can't modify the // CFG. - if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) { + if (auto *SI = dyn_cast<StoreInst>(&I)) { // Don't touch volatile stores. if (SI->isVolatile()) continue; @@ -1393,9 +1503,9 @@ static bool markAliveBlocks(Function &F, } Changed |= ConstantFoldTerminator(BB, true); - for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) - if (Reachable.insert(*SI).second) - Worklist.push_back(*SI); + for (BasicBlock *Successor : successors(BB)) + if (Reachable.insert(Successor).second) + Worklist.push_back(Successor); } while (!Worklist.empty()); return Changed; } @@ -1438,7 +1548,7 @@ void llvm::removeUnwindEdge(BasicBlock *BB) { /// if they are in a dead cycle. Return true if a change was made, false /// otherwise. bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI) { - SmallPtrSet<BasicBlock*, 128> Reachable; + SmallPtrSet<BasicBlock*, 16> Reachable; bool Changed = markAliveBlocks(F, Reachable); // If there are unreachable blocks in the CFG... 
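A side effect of the markAliveBlocks rework above: changeToUnreachable now reports how many instructions it deleted, so callers can account for removed code without re-counting. Minimal usage (hypothetical wrapper name):

#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/Local.h"

// Everything from I to the end of its block becomes unreachable; returns
// the number of instructions deleted. No llvm.trap is inserted here.
static unsigned truncateBlockAt(llvm::Instruction *I) {
  return llvm::changeToUnreachable(I, /*UseLLVMTrap=*/false);
}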
@@ -1454,10 +1564,9 @@ bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI) { if (Reachable.count(&*BB)) continue; - for (succ_iterator SI = succ_begin(&*BB), SE = succ_end(&*BB); SI != SE; - ++SI) - if (Reachable.count(*SI)) - (*SI)->removePredecessor(&*BB); + for (BasicBlock *Successor : successors(&*BB)) + if (Reachable.count(Successor)) + Successor->removePredecessor(&*BB); if (LVI) LVI->eraseBlock(&*BB); BB->dropAllReferences(); @@ -1495,6 +1604,7 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J, K->setMetadata(Kind, MDNode::getMostGenericAliasScope(JMD, KMD)); break; case LLVMContext::MD_noalias: + case LLVMContext::MD_mem_parallel_loop_access: K->setMetadata(Kind, MDNode::intersect(JMD, KMD)); break; case LLVMContext::MD_range: @@ -1566,7 +1676,7 @@ unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, UI != UE;) { Use &U = *UI++; auto *I = cast<Instruction>(U.getUser()); - if (DT.dominates(BB, I->getParent())) { + if (DT.properlyDominates(BB, I->getParent())) { U.set(To); DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as " << *To << " in " << *U << "\n"); @@ -1577,18 +1687,18 @@ unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, } bool llvm::callsGCLeafFunction(ImmutableCallSite CS) { - if (isa<IntrinsicInst>(CS.getInstruction())) - // Most LLVM intrinsics are things which can never take a safepoint. - // As a result, we don't need to have the stack parsable at the - // callsite. This is a highly useful optimization since intrinsic - // calls are fairly prevalent, particularly in debug builds. - return true; - // Check if the function is specifically marked as a gc leaf function. if (CS.hasFnAttr("gc-leaf-function")) return true; - if (const Function *F = CS.getCalledFunction()) - return F->hasFnAttribute("gc-leaf-function"); + if (const Function *F = CS.getCalledFunction()) { + if (F->hasFnAttribute("gc-leaf-function")) + return true; + + if (auto IID = F->getIntrinsicID()) + // Most LLVM intrinsics do not take safepoints. + return IID != Intrinsic::experimental_gc_statepoint && + IID != Intrinsic::experimental_deoptimize; + } return false; } @@ -1723,7 +1833,23 @@ collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals, // If the AndMask is zero for this bit, clear the bit. if ((AndMask & Bit) == 0) Result->Provenance[i] = BitPart::Unset; + return Result; + } + // If this is a zext instruction zero extend the result. + if (I->getOpcode() == Instruction::ZExt) { + auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps, + MatchBitReversals, BPS); + if (!Res) + return Result; + + Result = BitPart(Res->Provider, BitWidth); + auto NarrowBitWidth = + cast<IntegerType>(cast<ZExtInst>(I)->getSrcTy())->getBitWidth(); + for (unsigned i = 0; i < NarrowBitWidth; ++i) + Result->Provenance[i] = Res->Provenance[i]; + for (unsigned i = NarrowBitWidth; i < BitWidth; ++i) + Result->Provenance[i] = BitPart::Unset; return Result; } } @@ -1754,7 +1880,7 @@ static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To, /// Given an OR instruction, check to see if this is a bitreverse /// idiom. If so, insert the new intrinsic and return true. 
-bool llvm::recognizeBitReverseOrBSwapIdiom( +bool llvm::recognizeBSwapOrBitReverseIdiom( Instruction *I, bool MatchBSwaps, bool MatchBitReversals, SmallVectorImpl<Instruction *> &InsertedInsts) { if (Operator::getOpcode(I) != Instruction::Or) @@ -1766,6 +1892,15 @@ bool llvm::recognizeBitReverseOrBSwapIdiom( return false; // Can't do vectors or integers > 128 bits. unsigned BW = ITy->getBitWidth(); + unsigned DemandedBW = BW; + IntegerType *DemandedTy = ITy; + if (I->hasOneUse()) { + if (TruncInst *Trunc = dyn_cast<TruncInst>(I->user_back())) { + DemandedTy = cast<IntegerType>(Trunc->getType()); + DemandedBW = DemandedTy->getBitWidth(); + } + } + // Try to find all the pieces corresponding to the bswap. std::map<Value *, Optional<BitPart>> BPS; auto Res = collectBitParts(I, MatchBSwaps, MatchBitReversals, BPS); @@ -1775,11 +1910,12 @@ bool llvm::recognizeBitReverseOrBSwapIdiom( // Now, is the bit permutation correct for a bswap or a bitreverse? We can // only byteswap values with an even number of bytes. - bool OKForBSwap = BW % 16 == 0, OKForBitReverse = true; - for (unsigned i = 0; i < BW; ++i) { - OKForBSwap &= bitTransformIsCorrectForBSwap(BitProvenance[i], i, BW); + bool OKForBSwap = DemandedBW % 16 == 0, OKForBitReverse = true; + for (unsigned i = 0; i < DemandedBW; ++i) { + OKForBSwap &= + bitTransformIsCorrectForBSwap(BitProvenance[i], i, DemandedBW); OKForBitReverse &= - bitTransformIsCorrectForBitReverse(BitProvenance[i], i, BW); + bitTransformIsCorrectForBitReverse(BitProvenance[i], i, DemandedBW); } Intrinsic::ID Intrin; @@ -1790,7 +1926,51 @@ bool llvm::recognizeBitReverseOrBSwapIdiom( else return false; + if (ITy != DemandedTy) { + Function *F = Intrinsic::getDeclaration(I->getModule(), Intrin, DemandedTy); + Value *Provider = Res->Provider; + IntegerType *ProviderTy = cast<IntegerType>(Provider->getType()); + // We may need to truncate the provider. + if (DemandedTy != ProviderTy) { + auto *Trunc = CastInst::Create(Instruction::Trunc, Provider, DemandedTy, + "trunc", I); + InsertedInsts.push_back(Trunc); + Provider = Trunc; + } + auto *CI = CallInst::Create(F, Provider, "rev", I); + InsertedInsts.push_back(CI); + auto *ExtInst = CastInst::Create(Instruction::ZExt, CI, ITy, "zext", I); + InsertedInsts.push_back(ExtInst); + return true; + } + Function *F = Intrinsic::getDeclaration(I->getModule(), Intrin, ITy); InsertedInsts.push_back(CallInst::Create(F, Res->Provider, "rev", I)); return true; } + +// CodeGen has special handling for some string functions that may replace +// them with target-specific intrinsics. Since that'd skip our interceptors +// in ASan/MSan/TSan/DFSan, and thus make us miss some memory accesses, +// we mark affected calls as NoBuiltin, which will disable optimization +// in CodeGen. 
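+// For example (editor's illustration): a call like memcmp(a, b, 4) may be
+// lowered by CodeGen to an inline 32-bit load/compare with no libcall at all,
+// so MSan's memcmp interceptor never runs and the accesses to a and b go
+// unchecked; marking the call NoBuiltin keeps the interceptable libcall.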
+void llvm::maybeMarkSanitizerLibraryCallNoBuiltin(CallInst *CI, + const TargetLibraryInfo *TLI) { + Function *F = CI->getCalledFunction(); + LibFunc::Func Func; + if (!F || F->hasLocalLinkage() || !F->hasName() || + !TLI->getLibFunc(F->getName(), Func)) + return; + switch (Func) { + default: break; + case LibFunc::memcmp: + case LibFunc::memchr: + case LibFunc::strcpy: + case LibFunc::stpcpy: + case LibFunc::strcmp: + case LibFunc::strlen: + case LibFunc::strnlen: + CI->addAttribute(AttributeSet::FunctionIndex, Attribute::NoBuiltin); + break; + } +} diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp index 1fa469595d168..b3a928bf77531 100644 --- a/lib/Transforms/Utils/LoopSimplify.cpp +++ b/lib/Transforms/Utils/LoopSimplify.cpp @@ -37,6 +37,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SetOperations.h" @@ -489,14 +490,9 @@ ReprocessLoop: DEBUG(dbgs() << "LoopSimplify: Deleting edge from dead predecessor " << P->getName() << "\n"); - // Inform each successor of each dead pred. - for (succ_iterator SI = succ_begin(P), SE = succ_end(P); SI != SE; ++SI) - (*SI)->removePredecessor(P); // Zap the dead pred's terminator and replace it with unreachable. TerminatorInst *TI = P->getTerminator(); - TI->replaceAllUsesWith(UndefValue::get(TI->getType())); - P->getTerminator()->eraseFromParent(); - new UnreachableInst(P->getContext(), P); + changeToUnreachable(TI, /*UseLLVMTrap=*/false); Changed = true; } } @@ -506,14 +502,13 @@ ReprocessLoop: // trip count computations. SmallVector<BasicBlock*, 8> ExitingBlocks; L->getExitingBlocks(ExitingBlocks); - for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(), - E = ExitingBlocks.end(); I != E; ++I) - if (BranchInst *BI = dyn_cast<BranchInst>((*I)->getTerminator())) + for (BasicBlock *ExitingBlock : ExitingBlocks) + if (BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator())) if (BI->isConditional()) { if (UndefValue *Cond = dyn_cast<UndefValue>(BI->getCondition())) { DEBUG(dbgs() << "LoopSimplify: Resolving \"br i1 undef\" to exit in " - << (*I)->getName() << "\n"); + << ExitingBlock->getName() << "\n"); BI->setCondition(ConstantInt::get(Cond->getType(), !L->contains(BI->getSuccessor(0)))); @@ -545,9 +540,7 @@ ReprocessLoop: SmallSetVector<BasicBlock *, 8> ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end()); - for (SmallSetVector<BasicBlock *, 8>::iterator I = ExitBlockSet.begin(), - E = ExitBlockSet.end(); I != E; ++I) { - BasicBlock *ExitBlock = *I; + for (BasicBlock *ExitBlock : ExitBlockSet) { for (pred_iterator PI = pred_begin(ExitBlock), PE = pred_end(ExitBlock); PI != PE; ++PI) // Must be exactly this loop: no subloops, parent loops, or non-loop preds @@ -691,8 +684,10 @@ ReprocessLoop: } DT->eraseNode(ExitingBlock); - BI->getSuccessor(0)->removePredecessor(ExitingBlock); - BI->getSuccessor(1)->removePredecessor(ExitingBlock); + BI->getSuccessor(0)->removePredecessor( + ExitingBlock, /* DontDeleteUselessPHIs */ PreserveLCSSA); + BI->getSuccessor(1)->removePredecessor( + ExitingBlock, /* DontDeleteUselessPHIs */ PreserveLCSSA); ExitingBlock->eraseFromParent(); } } @@ -731,11 +726,6 @@ namespace { initializeLoopSimplifyPass(*PassRegistry::getPassRegistry()); } - DominatorTree *DT; - LoopInfo *LI; - ScalarEvolution *SE; - AssumptionCache *AC; - bool runOnFunction(Function &F) override; void 
getAnalysisUsage(AnalysisUsage &AU) const override { @@ -753,7 +743,8 @@ namespace { AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addPreserved<SCEVAAWrapperPass>(); - AU.addPreserved<DependenceAnalysis>(); + AU.addPreservedID(LCSSAID); + AU.addPreserved<DependenceAnalysisWrapperPass>(); AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added. } @@ -768,9 +759,6 @@ INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify", INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", "Canonicalize natural loops", false, false) @@ -783,20 +771,64 @@ Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); } /// bool LoopSimplify::runOnFunction(Function &F) { bool Changed = false; - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); - SE = SEWP ? &SEWP->getSE() : nullptr; - AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + ScalarEvolution *SE = SEWP ? &SEWP->getSE() : nullptr; + AssumptionCache *AC = + &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); +#ifndef NDEBUG + if (PreserveLCSSA) { + assert(DT && "DT not available."); + assert(LI && "LI not available."); + bool InLCSSA = + all_of(*LI, [&](Loop *L) { return L->isRecursivelyLCSSAForm(*DT); }); + assert(InLCSSA && "Requested to preserve LCSSA, but it's already broken."); + } +#endif // Simplify each loop nest in the function. for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) Changed |= simplifyLoop(*I, DT, LI, SE, AC, PreserveLCSSA); +#ifndef NDEBUG + if (PreserveLCSSA) { + bool InLCSSA = + all_of(*LI, [&](Loop *L) { return L->isRecursivelyLCSSAForm(*DT); }); + assert(InLCSSA && "LCSSA is broken after loop-simplify."); + } +#endif return Changed; } +PreservedAnalyses LoopSimplifyPass::run(Function &F, + AnalysisManager<Function> &AM) { + bool Changed = false; + LoopInfo *LI = &AM.getResult<LoopAnalysis>(F); + DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F); + ScalarEvolution *SE = AM.getCachedResult<ScalarEvolutionAnalysis>(F); + AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F); + + // FIXME: This pass should verify that the loops on which it's operating + // are in canonical SSA form, and that the pass itself preserves this form. + for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) + Changed |= simplifyLoop(*I, DT, LI, SE, AC, true /* PreserveLCSSA */); + + if (!Changed) + return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<LoopAnalysis>(); + PA.preserve<BasicAA>(); + PA.preserve<GlobalsAA>(); + PA.preserve<SCEVAA>(); + PA.preserve<ScalarEvolutionAnalysis>(); + PA.preserve<DependenceAnalysis>(); + return PA; +} + // FIXME: Restore this code when we re-enable verification in verifyAnalysis // below. 
#if 0 diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index eea9237ba80c6..7f1f78fa8b411 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -34,6 +34,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" using namespace llvm; @@ -44,9 +45,14 @@ using namespace llvm; STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled"); STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)"); -/// RemapInstruction - Convert the instruction operands from referencing the -/// current values into those specified by VMap. -static inline void RemapInstruction(Instruction *I, +static cl::opt<bool> +UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(true), cl::Hidden, + cl::desc("Allow runtime unrolled loops to be unrolled " + "with epilog instead of prolog.")); + +/// Convert the instruction operands from referencing the current values into +/// those specified by VMap. +static inline void remapInstruction(Instruction *I, ValueToValueMapTy &VMap) { for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) { Value *Op = I->getOperand(op); @@ -64,8 +70,8 @@ static inline void RemapInstruction(Instruction *I, } } -/// FoldBlockIntoPredecessor - Folds a basic block into its predecessor if it -/// only has one predecessor, and that predecessor only has one successor. +/// Folds a basic block into its predecessor if it only has one predecessor, and +/// that predecessor only has one successor. /// The LoopInfo Analysis that is passed will be kept consistent. If folding is /// successful references to the containing loop must be removed from /// ScalarEvolution by calling ScalarEvolution::forgetLoop because SE may have @@ -73,8 +79,9 @@ static inline void RemapInstruction(Instruction *I, /// of loops that have already been forgotten to prevent redundant, expensive /// calls to ScalarEvolution::forgetLoop. Returns the new combined block. static BasicBlock * -FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, ScalarEvolution *SE, - SmallPtrSetImpl<Loop *> &ForgottenLoops) { +foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, ScalarEvolution *SE, + SmallPtrSetImpl<Loop *> &ForgottenLoops, + DominatorTree *DT) { // Merge basic blocks into their predecessor if there is only one distinct // pred, and if there is only one distinct successor of the predecessor, and // if there are no PHI nodes. @@ -106,7 +113,16 @@ FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, ScalarEvolution *SE, // OldName will be valid until erased. StringRef OldName = BB->getName(); - // Erase basic block from the function... + // Erase the old block and update dominator info. + if (DT) + if (DomTreeNode *DTN = DT->getNode(BB)) { + DomTreeNode *PredDTN = DT->getNode(OnlyPred); + SmallVector<DomTreeNode *, 8> Children(DTN->begin(), DTN->end()); + for (auto *DI : Children) + DT->changeImmediateDominator(DI, PredDTN); + + DT->eraseNode(BB); + } // ScalarEvolution holds references to loop exit blocks. if (SE) { @@ -126,6 +142,35 @@ FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, ScalarEvolution *SE, return OnlyPred; } +/// Check if unrolling created a situation where we need to insert phi nodes to +/// preserve LCSSA form. 
+/// \param Blocks is a vector of basic blocks representing the unrolled loop.
+/// \param L is the outer loop.
+/// It's possible that some of the blocks are in L, and some are not. In this
+/// case, if there is a use outside L and the definition is inside L, we need
+/// to insert a phi-node, otherwise LCSSA will be broken.
+/// The function is just a helper function for llvm::UnrollLoop that returns
+/// true if this situation occurs, indicating that LCSSA needs to be fixed.
+static bool needToInsertPhisForLCSSA(Loop *L, std::vector<BasicBlock *> Blocks,
+                                     LoopInfo *LI) {
+  for (BasicBlock *BB : Blocks) {
+    if (LI->getLoopFor(BB) == L)
+      continue;
+    for (Instruction &I : *BB) {
+      for (Use &U : I.operands()) {
+        if (auto Def = dyn_cast<Instruction>(U)) {
+          Loop *DefLoop = LI->getLoopFor(Def->getParent());
+          if (!DefLoop)
+            continue;
+          if (DefLoop->contains(L))
+            return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
 /// Unroll the given loop by Count. The loop must be in LCSSA form. Returns true
 /// if unrolling was successful, or false if the loop was unmodified. Unrolling
 /// can only fail when the loop's latch block is not terminated by a conditional
@@ -155,7 +200,7 @@ FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, ScalarEvolution *SE,
 ///
 /// This utility preserves LoopInfo. It will also preserve ScalarEvolution and
 /// DominatorTree if they are non-null.
-bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
+bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
                       bool AllowRuntime, bool AllowExpensiveTripCount,
                       unsigned TripMultiple, LoopInfo *LI, ScalarEvolution *SE,
                       DominatorTree *DT, AssumptionCache *AC,
@@ -218,20 +263,48 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
   bool CompletelyUnroll = Count == TripCount;
   SmallVector<BasicBlock *, 4> ExitBlocks;
   L->getExitBlocks(ExitBlocks);
-  Loop *ParentL = L->getParentLoop();
-  bool AllExitsAreInsideParentLoop = !ParentL ||
-      std::all_of(ExitBlocks.begin(), ExitBlocks.end(),
-                  [&](BasicBlock *BB) { return ParentL->contains(BB); });
+  std::vector<BasicBlock*> OriginalLoopBlocks = L->getBlocks();
+
+  // Go through all exits of L and see if there are any phi-nodes there. We just
+  // conservatively assume that they're inserted to preserve LCSSA form, which
+  // means that complete unrolling might break this form. We need to either fix
+  // it in-place after the transformation, or entirely rebuild LCSSA. TODO: For
+  // now we just recompute LCSSA for the outer loop, but it should be possible
+  // to fix it in-place.
+  bool NeedToFixLCSSA = PreserveLCSSA && CompletelyUnroll &&
+      std::any_of(ExitBlocks.begin(), ExitBlocks.end(),
+                  [&](BasicBlock *BB) { return isa<PHINode>(BB->begin()); });
 
   // We assume a run-time trip count if the compiler cannot
   // figure out the loop trip count and the unroll-runtime
   // flag is specified.
   bool RuntimeTripCount = (TripCount == 0 && Count > 0 && AllowRuntime);
 
-  if (RuntimeTripCount &&
-      !UnrollRuntimeLoopProlog(L, Count, AllowExpensiveTripCount, LI, SE, DT,
-                               PreserveLCSSA))
-    return false;
+  // Loops containing convergent instructions must have a count that divides
+  // their TripMultiple.
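+  // For example (editor's note, values illustrative): with TripMultiple == 4,
+  // unrolling by Count == 2 needs no remainder loop (4 % 2 == 0), so any
+  // convergent calls stay control-flow uniform; Count == 3 would require a
+  // remainder loop, which the assertion below forbids when a convergent
+  // operation is present.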
+  DEBUG(
+      {
+        bool HasConvergent = false;
+        for (auto &BB : L->blocks())
+          for (auto &I : *BB)
+            if (auto CS = CallSite(&I))
+              HasConvergent |= CS.isConvergent();
+        assert((!HasConvergent || TripMultiple % Count == 0) &&
+               "Unroll count must divide trip multiple if loop contains a "
+               "convergent operation.");
+      });
+  // Don't output the runtime loop remainder if TripMultiple is a multiple of
+  // Count. Such a remainder is never needed, and is unsafe if the loop
+  // contains a convergent instruction.
+  if (RuntimeTripCount && TripMultiple % Count != 0 &&
+      !UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount,
+                                  UnrollRuntimeEpilog, LI, SE, DT,
+                                  PreserveLCSSA)) {
+    if (Force)
+      RuntimeTripCount = false;
+    else
+      return false;
+  }
 
   // Notify ScalarEvolution that the loop will be substantially changed,
   // if not outright eliminated.
@@ -308,6 +381,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
 
   LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
   LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
+  std::vector<BasicBlock*> UnrolledLoopBlocks = L->getBlocks();
   for (unsigned It = 1; It != Count; ++It) {
     std::vector<BasicBlock*> NewBlocks;
     SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
@@ -349,13 +423,13 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
       if (*BB == Header)
         // Loop over all of the PHI nodes in the block, changing them to use
         // the incoming values from the previous block.
-        for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) {
-          PHINode *NewPHI = cast<PHINode>(VMap[OrigPHINode[i]]);
+        for (PHINode *OrigPHI : OrigPHINode) {
+          PHINode *NewPHI = cast<PHINode>(VMap[OrigPHI]);
           Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
           if (Instruction *InValI = dyn_cast<Instruction>(InVal))
             if (It > 1 && L->contains(InValI))
               InVal = LastValueMap[InValI];
-          VMap[OrigPHINode[i]] = InVal;
+          VMap[OrigPHI] = InVal;
           New->getInstList().erase(NewPHI);
         }
 
@@ -366,11 +440,10 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
         LastValueMap[VI->first] = VI->second;
 
       // Add phi entries for newly created values to all exit blocks.
-      for (succ_iterator SI = succ_begin(*BB), SE = succ_end(*BB);
-           SI != SE; ++SI) {
-        if (L->contains(*SI))
+      for (BasicBlock *Succ : successors(*BB)) {
+        if (L->contains(Succ))
           continue;
-        for (BasicBlock::iterator BBI = (*SI)->begin();
+        for (BasicBlock::iterator BBI = Succ->begin();
              PHINode *phi = dyn_cast<PHINode>(BBI); ++BBI) {
           Value *Incoming = phi->getIncomingValueForBlock(*BB);
           ValueToValueMapTy::iterator It = LastValueMap.find(Incoming);
@@ -387,18 +460,33 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
 
       Latches.push_back(New);
       NewBlocks.push_back(New);
+      UnrolledLoopBlocks.push_back(New);
+
+      // Update DomTree: since we just copy the loop body, and each copy has a
+      // dedicated entry block (copy of the header block), this header's copy
+      // dominates all copied blocks. That means dominance relations in the
+      // copied body are the same as in the original body.
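+      // For instance (editor's illustration): if the original body is
+      // Header -> {A, B} -> Latch with idom(A) = idom(B) = Header, then in
+      // copy It we get idom(Header_It) = Latch_(It-1) and idom(A_It) =
+      // idom(B_It) = Header_It, mirroring the original dominance relations.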
+      if (DT) {
+        if (*BB == Header)
+          DT->addNewBlock(New, Latches[It - 1]);
+        else {
+          auto BBDomNode = DT->getNode(*BB);
+          auto BBIDom = BBDomNode->getIDom();
+          BasicBlock *OriginalBBIDom = BBIDom->getBlock();
+          DT->addNewBlock(
+              New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
+        }
+      }
     }
 
     // Remap all instructions in the most recent iteration
-    for (unsigned i = 0; i < NewBlocks.size(); ++i)
-      for (BasicBlock::iterator I = NewBlocks[i]->begin(),
-           E = NewBlocks[i]->end(); I != E; ++I)
-        ::RemapInstruction(&*I, LastValueMap);
+    for (BasicBlock *NewBlock : NewBlocks)
+      for (Instruction &I : *NewBlock)
+        ::remapInstruction(&I, LastValueMap);
   }
 
   // Loop over the PHI nodes in the original block, setting incoming values.
-  for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) {
-    PHINode *PN = OrigPHINode[i];
+  for (PHINode *PN : OrigPHINode) {
     if (CompletelyUnroll) {
       PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
       Header->getInstList().erase(PN);
@@ -453,11 +541,10 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
       // Remove phi operands at this loop exit
       if (Dest != LoopExit) {
         BasicBlock *BB = Latches[i];
-        for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB);
-             SI != SE; ++SI) {
-          if (*SI == Headers[i])
+        for (BasicBlock *Succ : successors(BB)) {
+          if (Succ == Headers[i])
             continue;
-          for (BasicBlock::iterator BBI = (*SI)->begin();
+          for (BasicBlock::iterator BBI = Succ->begin();
                PHINode *Phi = dyn_cast<PHINode>(BBI); ++BBI) {
             Phi->removeIncomingValue(BB, false);
           }
@@ -468,16 +555,43 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
       Term->eraseFromParent();
     }
   }
+
+  // Update dominators of blocks we might reach through exits.
+  // The immediate dominator of such a block might change, because we add more
+  // routes that can lead to the exit: we can now reach it from the copied
+  // iterations too. Thus, the new idom of the block will be the nearest common
+  // dominator of the previous idom and the common dominator of all copies of
+  // the previous idom. This is equivalent to the nearest common dominator of
+  // the previous idom and the first latch, which dominates all copies of the
+  // previous idom.
+  if (DT && Count > 1) {
+    for (auto *BB : OriginalLoopBlocks) {
+      auto *BBDomNode = DT->getNode(BB);
+      SmallVector<BasicBlock *, 16> ChildrenToUpdate;
+      for (auto *ChildDomNode : BBDomNode->getChildren()) {
+        auto *ChildBB = ChildDomNode->getBlock();
+        if (!L->contains(ChildBB))
+          ChildrenToUpdate.push_back(ChildBB);
+      }
+      BasicBlock *NewIDom = DT->findNearestCommonDominator(BB, Latches[0]);
+      for (auto *ChildBB : ChildrenToUpdate)
+        DT->changeImmediateDominator(ChildBB, NewIDom);
+    }
+  }
 
   // Merge adjacent basic blocks, if possible.
   SmallPtrSet<Loop *, 4> ForgottenLoops;
-  for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
-    BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());
+  for (BasicBlock *Latch : Latches) {
+    BranchInst *Term = cast<BranchInst>(Latch->getTerminator());
     if (Term->isUnconditional()) {
       BasicBlock *Dest = Term->getSuccessor(0);
-      if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI, SE,
-                                                      ForgottenLoops))
+      if (BasicBlock *Fold =
+              foldBlockIntoPredecessor(Dest, LI, SE, ForgottenLoops, DT)) {
+        // Dest has been folded into Fold. Update our worklists accordingly.
std::replace(Latches.begin(), Latches.end(), Dest, Fold); + UnrolledLoopBlocks.erase(std::remove(UnrolledLoopBlocks.begin(), + UnrolledLoopBlocks.end(), Dest), + UnrolledLoopBlocks.end()); + } } } @@ -485,10 +599,12 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, // whole function's cache. AC->clear(); - // FIXME: Reconstruct dom info, because it is not preserved properly. - // Incrementally updating domtree after loop unrolling would be easy. - if (DT) + // FIXME: We only preserve DT info for complete unrolling now. Incrementally + // updating domtree after partial loop unrolling should also be easy. + if (DT && !CompletelyUnroll) DT->recalculate(*L->getHeader()->getParent()); + else if (DT) + DEBUG(DT->verifyDomTree()); // Simplify any new induction variables in the partially unrolled loop. if (SE && !CompletelyUnroll) { @@ -508,19 +624,17 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, // go. const DataLayout &DL = Header->getModule()->getDataLayout(); const std::vector<BasicBlock*> &NewLoopBlocks = L->getBlocks(); - for (std::vector<BasicBlock*>::const_iterator BB = NewLoopBlocks.begin(), - BBE = NewLoopBlocks.end(); BB != BBE; ++BB) - for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; ) { + for (BasicBlock *BB : NewLoopBlocks) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { Instruction *Inst = &*I++; - if (isInstructionTriviallyDead(Inst)) - (*BB)->getInstList().erase(Inst); - else if (Value *V = SimplifyInstruction(Inst, DL)) - if (LI->replacementPreservesLCSSAForm(Inst, V)) { + if (Value *V = SimplifyInstruction(Inst, DL)) + if (LI->replacementPreservesLCSSAForm(Inst, V)) Inst->replaceAllUsesWith(V); - (*BB)->getInstList().erase(Inst); - } + if (isInstructionTriviallyDead(Inst)) + BB->getInstList().erase(Inst); } + } NumCompletelyUnrolled += CompletelyUnroll; ++NumUnrolled; @@ -530,6 +644,17 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, if (CompletelyUnroll) LI->markAsRemoved(L); + // After complete unrolling most of the blocks should be contained in OuterL. + // However, some of them might happen to be out of OuterL (e.g. if they + // precede a loop exit). In this case we might need to insert PHI nodes in + // order to preserve LCSSA form. + // We don't need to check this if we already know that we need to fix LCSSA + // form. + // TODO: For now we just recompute LCSSA for the outer loop in this case, but + // it should be possible to fix it in-place. + if (PreserveLCSSA && OuterL && CompletelyUnroll && !NeedToFixLCSSA) + NeedToFixLCSSA |= ::needToInsertPhisForLCSSA(OuterL, UnrolledLoopBlocks, LI); + // If we have a pass and a DominatorTree we should re-simplify impacted loops // to ensure subsequent analyses can rely on this form. We want to simplify // at least one layer outside of the loop that was unrolled so that any @@ -538,7 +663,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, if (!OuterL && !CompletelyUnroll) OuterL = L; if (OuterL) { - bool Simplified = simplifyLoop(OuterL, DT, LI, SE, AC, PreserveLCSSA); + simplifyLoop(OuterL, DT, LI, SE, AC, PreserveLCSSA); // LCSSA must be performed on the outermost affected loop. 
The unrolled
    // loop's last loop latch is guaranteed to be in the outermost loop after
@@ -548,7 +673,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
     while (OuterL->getParentLoop() != LatchLoop)
       OuterL = OuterL->getParentLoop();
 
-    if (CompletelyUnroll && (!AllExitsAreInsideParentLoop || Simplified))
+    if (NeedToFixLCSSA)
       formLCSSARecursively(*OuterL, *DT, LI, SE);
     else
       assert(OuterL->isLCSSAForm(*DT) &&
diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 0d68f18ad0e5e..861a50cf354d8 100644
--- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -16,8 +16,8 @@
 // case, we need to generate code to execute these 'left over' iterations.
 //
 // The current strategy generates an if-then-else sequence prior to the
-// unrolled loop to execute the 'left over' iterations. Other strategies
-// include generate a loop before or after the unrolled loop.
+// unrolled loop to execute the 'left over' iterations, which are run either
+// before (prolog) or after (epilog) the unrolled loop.
 //
 //===----------------------------------------------------------------------===//
 
@@ -60,91 +60,220 @@ STATISTIC(NumRuntimeUnrolled,
 /// than the unroll factor.
 ///
 static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
-                          BasicBlock *LastPrologBB, BasicBlock *PrologEnd,
-                          BasicBlock *OrigPH, BasicBlock *NewPH,
-                          ValueToValueMapTy &VMap, DominatorTree *DT,
-                          LoopInfo *LI, bool PreserveLCSSA) {
+                          BasicBlock *PrologExit, BasicBlock *PreHeader,
+                          BasicBlock *NewPreHeader, ValueToValueMapTy &VMap,
+                          DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA) {
   BasicBlock *Latch = L->getLoopLatch();
   assert(Latch && "Loop must have a latch");
+  BasicBlock *PrologLatch = cast<BasicBlock>(VMap[Latch]);
 
   // Create a PHI node for each outgoing value from the original loop
   // (which means it is an outgoing value from the prolog code too).
   // The new PHI node is inserted in the prolog end basic block.
-  // The new PHI name is added as an operand of a PHI node in either
+  // The new PHI node value is added as an operand of a PHI node in either
   // the loop header or the loop exit block.
-  for (succ_iterator SBI = succ_begin(Latch), SBE = succ_end(Latch);
-       SBI != SBE; ++SBI) {
-    for (BasicBlock::iterator BBI = (*SBI)->begin();
-         PHINode *PN = dyn_cast<PHINode>(BBI); ++BBI) {
-
+  for (BasicBlock *Succ : successors(Latch)) {
+    for (Instruction &BBI : *Succ) {
+      PHINode *PN = dyn_cast<PHINode>(&BBI);
+      // Exit when we passed all PHI nodes.
+      if (!PN)
+        break;
       // Add a new PHI node to the prolog end block and add the
       // appropriate incoming values.
-      PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName()+".unr",
-                                       PrologEnd->getTerminator());
+      PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr",
+                                       PrologExit->getFirstNonPHI());
       // Adding a value to the new PHI node from the original loop preheader.
       // This is the value that skips all the prolog code.
       if (L->contains(PN)) {
-        NewPN->addIncoming(PN->getIncomingValueForBlock(NewPH), OrigPH);
+        NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader),
+                           PreHeader);
       } else {
-        NewPN->addIncoming(UndefValue::get(PN->getType()), OrigPH);
+        NewPN->addIncoming(UndefValue::get(PN->getType()), PreHeader);
       }
 
       Value *V = PN->getIncomingValueForBlock(Latch);
       if (Instruction *I = dyn_cast<Instruction>(V)) {
        if (L->contains(I)) {
-          V = VMap[I];
+          V = VMap.lookup(I);
        }
      }
       // Adding a value to the new PHI node from the last prolog block
       // that was created.
- NewPN->addIncoming(V, LastPrologBB); + NewPN->addIncoming(V, PrologLatch); // Update the existing PHI node operand with the value from the // new PHI node. How this is done depends on if the existing // PHI node is in the original loop block, or the exit block. if (L->contains(PN)) { - PN->setIncomingValue(PN->getBasicBlockIndex(NewPH), NewPN); + PN->setIncomingValue(PN->getBasicBlockIndex(NewPreHeader), NewPN); } else { - PN->addIncoming(NewPN, PrologEnd); + PN->addIncoming(NewPN, PrologExit); } } } - // Create a branch around the orignal loop, which is taken if there are no + // Create a branch around the original loop, which is taken if there are no // iterations remaining to be executed after running the prologue. - Instruction *InsertPt = PrologEnd->getTerminator(); + Instruction *InsertPt = PrologExit->getTerminator(); IRBuilder<> B(InsertPt); assert(Count != 0 && "nonsensical Count!"); - // If BECount <u (Count - 1) then (BECount + 1) & (Count - 1) == (BECount + 1) - // (since Count is a power of 2). This means %xtraiter is (BECount + 1) and - // and all of the iterations of this loop were executed by the prologue. Note - // that if BECount <u (Count - 1) then (BECount + 1) cannot unsigned-overflow. + // If BECount <u (Count - 1) then (BECount + 1) % Count == (BECount + 1) + // This means %xtraiter is (BECount + 1) and all of the iterations of this + // loop were executed by the prologue. Note that if BECount <u (Count - 1) + // then (BECount + 1) cannot unsigned-overflow. Value *BrLoopExit = B.CreateICmpULT(BECount, ConstantInt::get(BECount->getType(), Count - 1)); BasicBlock *Exit = L->getUniqueExitBlock(); assert(Exit && "Loop must have a single exit block only"); // Split the exit to maintain loop canonicalization guarantees - SmallVector<BasicBlock*, 4> Preds(pred_begin(Exit), pred_end(Exit)); + SmallVector<BasicBlock*, 4> Preds(predecessors(Exit)); SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", DT, LI, PreserveLCSSA); // Add the branch to the exit block (around the unrolled loop) - B.CreateCondBr(BrLoopExit, Exit, NewPH); + B.CreateCondBr(BrLoopExit, Exit, NewPreHeader); + InsertPt->eraseFromParent(); +} + +/// Connect the unrolling epilog code to the original loop. +/// The unrolling epilog code contains code to execute the +/// 'extra' iterations if the run-time trip count modulo the +/// unroll count is non-zero. +/// +/// This function performs the following: +/// - Update PHI nodes at the unrolling loop exit and epilog loop exit +/// - Create PHI nodes at the unrolling loop exit to combine +/// values that exit the unrolling loop code and jump around it. +/// - Update PHI operands in the epilog loop by the new PHI nodes +/// - Branch around the epilog loop if extra iters (ModVal) is zero. +/// +static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, + BasicBlock *Exit, BasicBlock *PreHeader, + BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader, + ValueToValueMapTy &VMap, DominatorTree *DT, + LoopInfo *LI, bool PreserveLCSSA) { + BasicBlock *Latch = L->getLoopLatch(); + assert(Latch && "Loop must have a latch"); + BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]); + + // Loop structure should be the following: + // + // PreHeader + // NewPreHeader + // Header + // ... + // Latch + // NewExit (PN) + // EpilogPreHeader + // EpilogHeader + // ... + // EpilogLatch + // Exit (EpilogPN) + + // Update PHI nodes at NewExit and Exit. + for (Instruction &BBI : *NewExit) { + PHINode *PN = dyn_cast<PHINode>(&BBI); + // Exit when we passed all PHI nodes. 
+    if (!PN)
+      break;
+    // PN should be used in another PHI located in the Exit block, because
+    // Exit was split by SplitBlockPredecessors into Exit and NewExit.
+    // Basically it should look like:
+    // NewExit:
+    //   PN = PHI [I, Latch]
+    // ...
+    // Exit:
+    //   EpilogPN = PHI [PN, EpilogPreHeader]
+    //
+    // The incoming block is EpilogPreHeader instead of NewExit because
+    // NewExit was split one more time to get EpilogPreHeader.
+    assert(PN->hasOneUse() && "The phi should have 1 use");
+    PHINode *EpilogPN = cast<PHINode>(PN->use_begin()->getUser());
+    assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block");
+
+    // Add incoming PreHeader from branch around the Loop
+    PN->addIncoming(UndefValue::get(PN->getType()), PreHeader);
+
+    Value *V = PN->getIncomingValueForBlock(Latch);
+    Instruction *I = dyn_cast<Instruction>(V);
+    if (I && L->contains(I))
+      // If the value comes from an instruction in the loop, add the VMap value.
+      V = VMap.lookup(I);
+    // For an instruction outside the loop, a constant, or an undef value,
+    // insert the value itself.
+    EpilogPN->addIncoming(V, EpilogLatch);
+
+    assert(EpilogPN->getBasicBlockIndex(EpilogPreHeader) >= 0 &&
+           "EpilogPN should have EpilogPreHeader incoming block");
+    // Change EpilogPreHeader incoming block to NewExit.
+    EpilogPN->setIncomingBlock(EpilogPN->getBasicBlockIndex(EpilogPreHeader),
+                               NewExit);
+    // Now PHIs should look like:
+    // NewExit:
+    //   PN = PHI [I, Latch], [undef, PreHeader]
+    // ...
+    // Exit:
+    //   EpilogPN = PHI [PN, NewExit], [VMap[I], EpilogLatch]
+  }
+
+  // Create PHI nodes at NewExit (from the unrolling loop Latch and PreHeader).
+  // Update corresponding PHI nodes in epilog loop.
+  for (BasicBlock *Succ : successors(Latch)) {
+    // Skip this as we already updated phis in exit blocks.
+    if (!L->contains(Succ))
+      continue;
+    for (Instruction &BBI : *Succ) {
+      PHINode *PN = dyn_cast<PHINode>(&BBI);
+      // Exit when we passed all PHI nodes.
+      if (!PN)
+        break;
+      // Add new PHI nodes to the loop exit block and update epilog
+      // PHIs with the new PHI values.
+      PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr",
+                                       NewExit->getFirstNonPHI());
+      // Adding a value to the new PHI node from the unrolling loop preheader.
+      NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader), PreHeader);
+      // Adding a value to the new PHI node from the unrolling loop latch.
+      NewPN->addIncoming(PN->getIncomingValueForBlock(Latch), Latch);
+
+      // Update the existing PHI node operand with the value from the new PHI
+      // node. The corresponding instruction in the epilog loop should be a PHI.
+      PHINode *VPN = cast<PHINode>(VMap[&BBI]);
+      VPN->setIncomingValue(VPN->getBasicBlockIndex(EpilogPreHeader), NewPN);
+    }
+  }
+
+  Instruction *InsertPt = NewExit->getTerminator();
+  IRBuilder<> B(InsertPt);
+  Value *BrLoopExit = B.CreateIsNotNull(ModVal, "lcmp.mod");
+  assert(Exit && "Loop must have a single exit block only");
+  // Split the exit to maintain loop canonicalization guarantees
+  SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
+  SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI,
+                         PreserveLCSSA);
+  // Add the branch to the exit block (around the unrolling loop)
+  B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit);
+  InsertPt->eraseFromParent();
+}
+
 /// Create a clone of the blocks in a loop and connect them together.
-/// If UnrollProlog is true, loop structure will not be cloned, otherwise a new
-/// loop will be created including all cloned blocks, and the iterator of it
-/// switches to count NewIter down to 0.
+/// If CreateRemainderLoop is false, loop structure will not be cloned;
+/// otherwise a new loop will be created including all cloned blocks, and its
+/// induction variable counts NewIter down to 0.
+/// The cloned blocks should be inserted between InsertTop and InsertBot.
+/// If the loop structure is cloned, InsertTop should be the new preheader and
+/// InsertBot the new loop exit.
 ///
-static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
+static void CloneLoopBlocks(Loop *L, Value *NewIter,
+                            const bool CreateRemainderLoop,
+                            const bool UseEpilogRemainder,
                             BasicBlock *InsertTop, BasicBlock *InsertBot,
+                            BasicBlock *Preheader,
                             std::vector<BasicBlock *> &NewBlocks,
                             LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap,
                             LoopInfo *LI) {
-  BasicBlock *Preheader = L->getLoopPreheader();
+  StringRef suffix = UseEpilogRemainder ? "epil" : "prol";
   BasicBlock *Header = L->getHeader();
   BasicBlock *Latch = L->getLoopLatch();
   Function *F = Header->getParent();
@@ -152,7 +281,7 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
   LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
 
   Loop *NewLoop = nullptr;
   Loop *ParentLoop = L->getParentLoop();
-  if (!UnrollProlog) {
+  if (CreateRemainderLoop) {
     NewLoop = new Loop();
     if (ParentLoop)
       ParentLoop->addChildLoop(NewLoop);
@@ -163,7 +292,7 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
   // For each block in the original loop, create a new copy,
   // and update the value map with the newly created values.
   for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
-    BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".prol", F);
+    BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F);
     NewBlocks.push_back(NewBB);
 
     if (NewLoop)
@@ -176,19 +305,20 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
       // For the first block, add a CFG connection to this newly
       // created block.
       InsertTop->getTerminator()->setSuccessor(0, NewBB);
-    }
+
     if (Latch == *BB) {
-      // For the last block, if UnrollProlog is true, create a direct jump to
-      // InsertBot. If not, create a loop back to cloned head.
+      // For the last block, if CreateRemainderLoop is false, create a direct
+      // jump to InsertBot. If not, create a loop back to cloned head.
       VMap.erase((*BB)->getTerminator());
       BasicBlock *FirstLoopBB = cast<BasicBlock>(VMap[Header]);
       BranchInst *LatchBR = cast<BranchInst>(NewBB->getTerminator());
       IRBuilder<> Builder(LatchBR);
-      if (UnrollProlog) {
+      if (!CreateRemainderLoop) {
         Builder.CreateBr(InsertBot);
       } else {
-        PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2, "prol.iter",
+        PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2,
+                                          suffix + ".iter",
                                           FirstLoopBB->getFirstNonPHI());
         Value *IdxSub =
             Builder.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
@@ -207,9 +337,15 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
   // cloned loop.
for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { PHINode *NewPHI = cast<PHINode>(VMap[&*I]); - if (UnrollProlog) { - VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader); - cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI); + if (!CreateRemainderLoop) { + if (UseEpilogRemainder) { + unsigned idx = NewPHI->getBasicBlockIndex(Preheader); + NewPHI->setIncomingBlock(idx, InsertTop); + NewPHI->removeIncomingValue(Latch, false); + } else { + VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader); + cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI); + } } else { unsigned idx = NewPHI->getBasicBlockIndex(Preheader); NewPHI->setIncomingBlock(idx, InsertTop); @@ -217,8 +353,8 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, idx = NewPHI->getBasicBlockIndex(Latch); Value *InVal = NewPHI->getIncomingValue(idx); NewPHI->setIncomingBlock(idx, NewLatch); - if (VMap[InVal]) - NewPHI->setIncomingValue(idx, VMap[InVal]); + if (Value *V = VMap.lookup(InVal)) + NewPHI->setIncomingValue(idx, V); } } if (NewLoop) { @@ -254,11 +390,11 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, } } -/// Insert code in the prolog code when unrolling a loop with a +/// Insert code in the prolog/epilog code when unrolling a loop with a /// run-time trip-count. /// /// This method assumes that the loop unroll factor is total number -/// of loop bodes in the loop after unrolling. (Some folks refer +/// of loop bodies in the loop after unrolling. (Some folks refer /// to the unroll factor as the number of *extra* copies added). /// We assume also that the loop unroll factor is a power-of-two. So, after /// unrolling the loop, the number of loop bodies executed is 2, @@ -266,37 +402,56 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, /// instruction in SimplifyCFG.cpp. Then, the backend decides how code for /// the switch instruction is generated. /// +/// ***Prolog case*** /// extraiters = tripcount % loopfactor /// if (extraiters == 0) jump Loop: -/// else jump Prol +/// else jump Prol: /// Prol: LoopBody; /// extraiters -= 1 // Omitted if unroll factor is 2. /// if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2. -/// if (tripcount < loopfactor) jump End +/// if (tripcount < loopfactor) jump End: /// Loop: /// ... /// End: /// -bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, - bool AllowExpensiveTripCount, LoopInfo *LI, - ScalarEvolution *SE, DominatorTree *DT, - bool PreserveLCSSA) { +/// ***Epilog case*** +/// extraiters = tripcount % loopfactor +/// if (tripcount < loopfactor) jump LoopExit: +/// unroll_iters = tripcount - extraiters +/// Loop: LoopBody; (executes unroll_iter times); +/// unroll_iter -= 1 +/// if (unroll_iter != 0) jump Loop: +/// LoopExit: +/// if (extraiters == 0) jump EpilExit: +/// Epil: LoopBody; (executes extraiters times) +/// extraiters -= 1 // Omitted if unroll factor is 2. +/// if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2. +/// EpilExit: + +bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, + bool AllowExpensiveTripCount, + bool UseEpilogRemainder, + LoopInfo *LI, ScalarEvolution *SE, + DominatorTree *DT, bool PreserveLCSSA) { // for now, only unroll loops that contain a single exit if (!L->getExitingBlock()) return false; // Make sure the loop is in canonical form, and there is a single // exit block only. 
- if (!L->isLoopSimplifyForm() || !L->getUniqueExitBlock()) + if (!L->isLoopSimplifyForm()) + return false; + BasicBlock *Exit = L->getUniqueExitBlock(); // successor out of loop + if (!Exit) return false; - // Use Scalar Evolution to compute the trip count. This allows more - // loops to be unrolled than relying on induction var simplification + // Use Scalar Evolution to compute the trip count. This allows more loops to + // be unrolled than relying on induction var simplification. if (!SE) return false; - // Only unroll loops with a computable trip count and the trip count needs - // to be an int value (allowing a pointer type is a TODO item) + // Only unroll loops with a computable trip count, and the trip count needs + // to be an int value (allowing a pointer type is a TODO item). const SCEV *BECountSC = SE->getBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(BECountSC) || !BECountSC->getType()->isIntegerTy()) @@ -304,21 +459,19 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth(); - // Add 1 since the backedge count doesn't include the first loop iteration + // Add 1 since the backedge count doesn't include the first loop iteration. const SCEV *TripCountSC = SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1)); if (isa<SCEVCouldNotCompute>(TripCountSC)) return false; BasicBlock *Header = L->getHeader(); + BasicBlock *PreHeader = L->getLoopPreheader(); + BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator()); const DataLayout &DL = Header->getModule()->getDataLayout(); SCEVExpander Expander(*SE, DL, "loop-unroll"); - if (!AllowExpensiveTripCount && Expander.isHighCostExpansion(TripCountSC, L)) - return false; - - // We only handle cases when the unroll factor is a power of 2. - // Count is the loop unroll factor, the number of extra copies added + 1. - if (!isPowerOf2_32(Count)) + if (!AllowExpensiveTripCount && + Expander.isHighCostExpansion(TripCountSC, L, PreHeaderBR)) return false; // This constraint lets us deal with an overflowing trip count easily; see the @@ -326,51 +479,115 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, if (Log2_32(Count) > BEWidth) return false; - // If this loop is nested, then the loop unroller changes the code in - // parent loop, so the Scalar Evolution pass needs to be run again + // If this loop is nested, then the loop unroller changes the code in the + // parent loop, so the Scalar Evolution pass needs to be run again. if (Loop *ParentLoop = L->getParentLoop()) SE->forgetLoop(ParentLoop); - BasicBlock *PH = L->getLoopPreheader(); BasicBlock *Latch = L->getLoopLatch(); - // It helps to splits the original preheader twice, one for the end of the - // prolog code and one for a new loop preheader - BasicBlock *PEnd = SplitEdge(PH, Header, DT, LI); - BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), DT, LI); - BranchInst *PreHeaderBR = cast<BranchInst>(PH->getTerminator()); + // Loop structure is the following: + // + // PreHeader + // Header + // ... + // Latch + // Exit + + BasicBlock *NewPreHeader; + BasicBlock *NewExit = nullptr; + BasicBlock *PrologExit = nullptr; + BasicBlock *EpilogPreHeader = nullptr; + BasicBlock *PrologPreHeader = nullptr; + + if (UseEpilogRemainder) { + // If epilog remainder + // Split PreHeader to insert a branch around loop for unrolling. 
+    NewPreHeader = SplitBlock(PreHeader, PreHeader->getTerminator(), DT, LI);
+    NewPreHeader->setName(PreHeader->getName() + ".new");
+    // Split Exit to create phi nodes from branch above.
+    SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
+    NewExit = SplitBlockPredecessors(Exit, Preds, ".unr-lcssa",
+                                     DT, LI, PreserveLCSSA);
+    // Split NewExit to insert epilog remainder loop.
+    EpilogPreHeader = SplitBlock(NewExit, NewExit->getTerminator(), DT, LI);
+    EpilogPreHeader->setName(Header->getName() + ".epil.preheader");
+  } else {
+    // If prolog remainder
+    // Split the original preheader twice to insert the prolog remainder loop.
+    PrologPreHeader = SplitEdge(PreHeader, Header, DT, LI);
+    PrologPreHeader->setName(Header->getName() + ".prol.preheader");
+    PrologExit = SplitBlock(PrologPreHeader, PrologPreHeader->getTerminator(),
+                            DT, LI);
+    PrologExit->setName(Header->getName() + ".prol.loopexit");
+    // Split PrologExit to get NewPreHeader.
+    NewPreHeader = SplitBlock(PrologExit, PrologExit->getTerminator(), DT, LI);
+    NewPreHeader->setName(PreHeader->getName() + ".new");
+  }
+  // Loop structure should be the following:
+  //  Epilog                 Prolog
+  //
+  //  PreHeader              PreHeader
+  // *NewPreHeader          *PrologPreHeader
+  //  Header                *PrologExit
+  //  ...                   *NewPreHeader
+  //  Latch                  Header
+  // *NewExit                ...
+  // *EpilogPreHeader        Latch
+  //  Exit                   Exit
+
+  // Calculate the conditions for the branch around the unrolled loop in the
+  // epilog case, and around the prolog remainder loop in the prolog case.
   // Compute the number of extra iterations required, which is:
-  //  extra iterations = run-time trip count % (loop unroll factor + 1)
+  //  extra iterations = run-time trip count % loop unroll factor
+  PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
   Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
                                             PreHeaderBR);
   Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(),
                                           PreHeaderBR);
-
   IRBuilder<> B(PreHeaderBR);
-  Value *ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");
-
-  // If ModVal is zero, we know that either
-  //  1. there are no iteration to be run in the prologue loop
-  //  OR
-  //  2. the addition computing TripCount overflowed
-  //
-  // If (2) is true, we know that TripCount really is (1 << BEWidth) and so the
-  // number of iterations that remain to be run in the original loop is a
-  // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we
-  // explicitly check this above).
-
-  Value *BranchVal = B.CreateIsNotNull(ModVal, "lcmp.mod");
-
-  // Branch to either the extra iterations or the cloned/unrolled loop
-  // We will fix up the true branch label when adding loop body copies
-  B.CreateCondBr(BranchVal, PEnd, PEnd);
-  assert(PreHeaderBR->isUnconditional() &&
-         PreHeaderBR->getSuccessor(0) == PEnd &&
-         "CFG edges in Preheader are not correct");
+  Value *ModVal;
+  // Calculate ModVal = (BECount + 1) % Count.
+  // Note that TripCount is BECount + 1.
+  if (isPowerOf2_32(Count)) {
+    // When Count is a power of 2 we don't need BECount in the epilog case;
+    // however, we'll need it for the branch around the unrolling loop in the
+    // prolog case.
+    ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");
+    // If ModVal is zero, we know that either:
+    //  1. There are no iterations to be run in the prolog/epilog loop.
+    // OR
+    //  2. The addition computing TripCount overflowed.
+    //
+    // If (2) is true, we know that TripCount really is (1 << BEWidth) and so
+    // the number of iterations that remain to be run in the original loop is
+    // a multiple of Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth
+    // (we explicitly check this above).
+  } else {
+    // As (BECount + 1) can potentially overflow in unsigned arithmetic, we
+    // instead compute (BECount % Count) + 1, which is overflow-safe because
+    // BECount % Count < Count.
+    Value *ModValTmp = B.CreateURem(BECount,
+                                    ConstantInt::get(BECount->getType(),
+                                                     Count));
+    Value *ModValAdd = B.CreateAdd(ModValTmp,
+                                   ConstantInt::get(ModValTmp->getType(), 1));
+    // At that point (BECount % Count) + 1 could be equal to Count, so we take
+    // the remainder modulo Count one more time. For example, BECount == 5 and
+    // Count == 3 give 5 % 3 + 1 == 3, and the second urem folds that to 0
+    // extra iterations, matching the true trip count of 6, an exact multiple
+    // of 3.
+    ModVal = B.CreateURem(ModValAdd,
+                          ConstantInt::get(BECount->getType(), Count),
+                          "xtraiter");
+  }
+  Value *BranchVal =
+      UseEpilogRemainder ? B.CreateICmpULT(BECount,
+                                           ConstantInt::get(BECount->getType(),
+                                                            Count - 1)) :
+                           B.CreateIsNotNull(ModVal, "lcmp.mod");
+  BasicBlock *RemainderLoop = UseEpilogRemainder ? NewExit : PrologPreHeader;
+  BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit;
+  // Branch to either the remainder (extra iterations) loop or the unrolled
+  // loop.
+  B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop);
   PreHeaderBR->eraseFromParent();
   Function *F = Header->getParent();
   // Get an ordered list of blocks in the loop to help with the ordering of the
-  // cloned blocks in the prolog code
+  // cloned blocks in the prolog/epilog code
   LoopBlocksDFS LoopBlocks(L);
   LoopBlocks.perform(LI);
 
@@ -382,34 +599,80 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count,
   std::vector<BasicBlock *> NewBlocks;
   ValueToValueMapTy VMap;
 
-  bool UnrollPrologue = Count == 2;
+  // For unroll factor 2 the remainder loop will have exactly 1 iteration.
+  // Do not create a 1-iteration loop.
+  bool CreateRemainderLoop = (Count != 2);
 
   // Clone all the basic blocks in the loop. If Count is 2, we don't clone
   // the loop, otherwise we create a cloned loop to execute the extra
   // iterations. This function adds the appropriate CFG connections.
-  CloneLoopBlocks(L, ModVal, UnrollPrologue, PH, PEnd, NewBlocks, LoopBlocks,
-                  VMap, LI);
-
-  // Insert the cloned blocks into function just before the original loop
-  F->getBasicBlockList().splice(PEnd->getIterator(), F->getBasicBlockList(),
-                                NewBlocks[0]->getIterator(), F->end());
-
-  // Rewrite the cloned instruction operands to use the values
-  // created when the clone is created.
-  for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) {
-    for (BasicBlock::iterator I = NewBlocks[i]->begin(),
-                              E = NewBlocks[i]->end();
-         I != E; ++I) {
-      RemapInstruction(&*I, VMap,
-                       RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+  BasicBlock *InsertBot = UseEpilogRemainder ? Exit : PrologExit;
+  BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
+  CloneLoopBlocks(L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop,
+                  InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, LI);
+
+  // Insert the cloned blocks into the function.
+  F->getBasicBlockList().splice(InsertBot->getIterator(),
+                                F->getBasicBlockList(),
+                                NewBlocks[0]->getIterator(),
+                                F->end());
+
+  // Loop structure should be the following:
+  //  Epilog                 Prolog
+  //
+  //  PreHeader              PreHeader
+  //  NewPreHeader           PrologPreHeader
+  //  Header                 PrologHeader
+  //  ...                    ...
+  //  Latch                  PrologLatch
+  //  NewExit                PrologExit
+  //  EpilogPreHeader        NewPreHeader
+  //  EpilogHeader           Header
+  //  ...                    ...
+  //  EpilogLatch            Latch
+  //  Exit                   Exit
+
+  // Rewrite the cloned instruction operands to use the values created when the
+  // clone is created.
+  for (BasicBlock *BB : NewBlocks) {
+    for (Instruction &I : *BB) {
+      RemapInstruction(&I, VMap,
+                       RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
     }
   }
 
-  // Connect the prolog code to the original loop and update the
-  // PHI functions.
-  BasicBlock *LastLoopBB = cast<BasicBlock>(VMap[Latch]);
-  ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, DT, LI,
-                PreserveLCSSA);
+  if (UseEpilogRemainder) {
+    // Connect the epilog code to the original loop and update the
+    // PHI functions.
+    ConnectEpilog(L, ModVal, NewExit, Exit, PreHeader,
+                  EpilogPreHeader, NewPreHeader, VMap, DT, LI,
+                  PreserveLCSSA);
+
+    // Update the counter in the unrolled loop: it must run unroll_iter
+    // iterations, which is a multiple of Count.
+    IRBuilder<> B2(NewPreHeader->getTerminator());
+    Value *TestVal = B2.CreateSub(TripCount, ModVal, "unroll_iter");
+    BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
+    B2.SetInsertPoint(LatchBR);
+    PHINode *NewIdx = PHINode::Create(TestVal->getType(), 2, "niter",
+                                      Header->getFirstNonPHI());
+    Value *IdxSub =
+        B2.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
+                     NewIdx->getName() + ".nsub");
+    Value *IdxCmp;
+    if (LatchBR->getSuccessor(0) == Header)
+      IdxCmp = B2.CreateIsNotNull(IdxSub, NewIdx->getName() + ".ncmp");
+    else
+      IdxCmp = B2.CreateIsNull(IdxSub, NewIdx->getName() + ".ncmp");
+    NewIdx->addIncoming(TestVal, NewPreHeader);
+    NewIdx->addIncoming(IdxSub, Latch);
+    LatchBR->setCondition(IdxCmp);
+  } else {
+    // Connect the prolog code to the original loop and update the
+    // PHI functions.
+    ConnectProlog(L, BECount, Count, PrologExit, PreHeader, NewPreHeader,
+                  VMap, DT, LI, PreserveLCSSA);
+  }
   NumRuntimeUnrolled++;
   return true;
 }
diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp
index fa958e913b7bd..3902c67c6a013 100644
--- a/lib/Transforms/Utils/LoopUtils.cpp
+++ b/lib/Transforms/Utils/LoopUtils.cpp
@@ -11,13 +11,20 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 
@@ -423,7 +430,7 @@ RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind,
   default:
     return InstDesc(false, I);
   case Instruction::PHI:
-    return InstDesc(I, Prev.getMinMaxKind());
+    return InstDesc(I, Prev.getMinMaxKind(), Prev.getUnsafeAlgebraInst());
   case Instruction::Sub:
   case Instruction::Add:
     return InstDesc(Kind == RK_IntegerAdd, I);
@@ -466,12 +473,10 @@ bool RecurrenceDescriptor::hasMultipleUsesOf(
 
 bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
                                           RecurrenceDescriptor &RedDes) {
-  bool HasFunNoNaNAttr = false;
   BasicBlock *Header = TheLoop->getHeader();
   Function &F = *Header->getParent();
-  if (F.hasFnAttribute("no-nans-fp-math"))
-    HasFunNoNaNAttr =
-        F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
+  bool HasFunNoNaNAttr =
+      F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
 
   if (AddReductionVar(Phi, RK_IntegerAdd, TheLoop, HasFunNoNaNAttr, RedDes)) {
     DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n");
@@ -514,6 +519,43 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
   return false;
 }
 
+bool RecurrenceDescriptor::isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop,
+                                                  DominatorTree *DT) {
+
+  // Ensure the phi node is in the loop header and has two incoming values.
+  if (Phi->getParent() != TheLoop->getHeader() ||
+      Phi->getNumIncomingValues() != 2)
+    return false;
+
+  // Ensure the loop has a preheader and a single latch block. The loop
+  // vectorizer will need the latch to set up the next iteration of the loop.
+  auto *Preheader = TheLoop->getLoopPreheader();
+  auto *Latch = TheLoop->getLoopLatch();
+  if (!Preheader || !Latch)
+    return false;
+
+  // Ensure the phi node's incoming blocks are the loop preheader and latch.
+  if (Phi->getBasicBlockIndex(Preheader) < 0 ||
+      Phi->getBasicBlockIndex(Latch) < 0)
+    return false;
+
+  // Get the previous value. The previous value comes from the latch edge while
+  // the initial value comes from the preheader edge.
+  auto *Previous = dyn_cast<Instruction>(Phi->getIncomingValueForBlock(Latch));
+  if (!Previous || !TheLoop->contains(Previous) || isa<PHINode>(Previous))
+    return false;
+
+  // Ensure every user of the phi node is dominated by the previous value. The
+  // dominance requirement ensures the loop vectorizer will not need to
+  // vectorize the initial value prior to the first iteration of the loop.
+  for (User *U : Phi->users())
+    if (auto *I = dyn_cast<Instruction>(U))
+      if (!DT->dominates(Previous, I))
+        return false;
+
+  return true;
+}
+
 /// This function returns the identity element (or neutral element) for
 /// the operation K.
 Constant *RecurrenceDescriptor::getRecurrenceIdentity(RecurrenceKind K,
@@ -612,61 +654,120 @@ Value *RecurrenceDescriptor::createMinMaxOp(IRBuilder<> &Builder,
 }
 
 InductionDescriptor::InductionDescriptor(Value *Start, InductionKind K,
-                                         ConstantInt *Step)
-    : StartValue(Start), IK(K), StepValue(Step) {
+                                         const SCEV *Step)
+    : StartValue(Start), IK(K), Step(Step) {
   assert(IK != IK_NoInduction && "Not an induction");
+
+  // Start value type should match the induction kind and the value
+  // itself should not be null.
   assert(StartValue && "StartValue is null");
-  assert(StepValue && !StepValue->isZero() && "StepValue is zero");
   assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) &&
          "StartValue is not a pointer for pointer induction");
   assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) &&
          "StartValue is not an integer for integer induction");
-  assert(StepValue->getType()->isIntegerTy() &&
-         "StepValue is not an integer");
+
+  // Check the Step Value. It should be a non-zero integer value.
+ assert((!getConstIntStepValue() || !getConstIntStepValue()->isZero()) &&
+ "Step value is zero");
+
+ assert((IK != IK_PtrInduction || getConstIntStepValue()) &&
+ "Step value should be constant for pointer induction");
+ assert(Step->getType()->isIntegerTy() && "StepValue is not an integer");
}
int InductionDescriptor::getConsecutiveDirection() const {
- if (StepValue && (StepValue->isOne() || StepValue->isMinusOne()))
- return StepValue->getSExtValue();
+ ConstantInt *ConstStep = getConstIntStepValue();
+ if (ConstStep && (ConstStep->isOne() || ConstStep->isMinusOne()))
+ return ConstStep->getSExtValue();
return 0;
}
-Value *InductionDescriptor::transform(IRBuilder<> &B, Value *Index) const {
+ConstantInt *InductionDescriptor::getConstIntStepValue() const {
+ if (isa<SCEVConstant>(Step))
+ return dyn_cast<ConstantInt>(cast<SCEVConstant>(Step)->getValue());
+ return nullptr;
+}
+
+Value *InductionDescriptor::transform(IRBuilder<> &B, Value *Index,
+ ScalarEvolution *SE,
+ const DataLayout& DL) const {
+
+ SCEVExpander Exp(*SE, DL, "induction");
switch (IK) {
- case IK_IntInduction:
+ case IK_IntInduction: {
assert(Index->getType() == StartValue->getType() &&
"Index type does not match StartValue type");
- if (StepValue->isMinusOne())
- return B.CreateSub(StartValue, Index);
- if (!StepValue->isOne())
- Index = B.CreateMul(Index, StepValue);
- return B.CreateAdd(StartValue, Index);
- case IK_PtrInduction:
- assert(Index->getType() == StepValue->getType() &&
+ // FIXME: Theoretically, we can call getAddExpr() of ScalarEvolution
+ // and calculate (Start + Index * Step) for all cases, without
+ // special handling for "isOne" and "isMinusOne".
+ // But in practice the resulting code gets worse. We mix SCEV
+ // expressions and ADD/SUB operations and end up with redundant
+ // intermediate values calculated in different ways, which
+ // InstCombine is unable to reduce.
+
+ if (getConstIntStepValue() &&
+ getConstIntStepValue()->isMinusOne())
+ return B.CreateSub(StartValue, Index);
+ if (getConstIntStepValue() &&
+ getConstIntStepValue()->isOne())
+ return B.CreateAdd(StartValue, Index);
+ const SCEV *S = SE->getAddExpr(SE->getSCEV(StartValue),
+ SE->getMulExpr(Step, SE->getSCEV(Index)));
+ return Exp.expandCodeFor(S, StartValue->getType(), &*B.GetInsertPoint());
+ }
+ case IK_PtrInduction: {
+ assert(Index->getType() == Step->getType() &&
"Index type does not match StepValue type");
- if (StepValue->isMinusOne())
- Index = B.CreateNeg(Index);
- else if (!StepValue->isOne())
- Index = B.CreateMul(Index, StepValue);
+ assert(isa<SCEVConstant>(Step) &&
+ "Expected constant step for pointer induction");
+ const SCEV *S = SE->getMulExpr(SE->getSCEV(Index), Step);
+ Index = Exp.expandCodeFor(S, Index->getType(), &*B.GetInsertPoint());
return B.CreateGEP(nullptr, StartValue, Index);
-
+ }
case IK_NoInduction:
return nullptr;
}
llvm_unreachable("invalid enum");
}
-bool InductionDescriptor::isInductionPHI(PHINode *Phi, ScalarEvolution *SE,
- InductionDescriptor &D) {
+bool InductionDescriptor::isInductionPHI(PHINode *Phi,
+ PredicatedScalarEvolution &PSE,
+ InductionDescriptor &D,
+ bool Assume) {
+ Type *PhiTy = Phi->getType();
+ // We only handle integer and pointer induction variables.
+ if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
+ return false;
+
+ const SCEV *PhiScev = PSE.getSCEV(Phi);
+ const auto *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
+
+ // We need this expression to be an AddRecExpr.
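In scalar terms, transform() materializes Start + Index * Step for an induction, eliding the multiply when the constant step is 1 or -1 so that InstCombine sees simple adds and subs. A minimal sketch of that arithmetic, assuming a constant step (illustrative, not part of the patch):

// Illustrative only: the value of an integer induction at a given index.
long long intInductionAt(long long Start, long long Index, long long Step) {
  if (Step == -1)
    return Start - Index;      // mirrors the CreateSub fast path above
  if (Step == 1)
    return Start + Index;      // mirrors the CreateAdd fast path above
  return Start + Index * Step; // general form, expanded via SCEV
}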
+ if (Assume && !AR)
+ AR = PSE.getAsAddRec(Phi);
+
+ if (!AR) {
+ DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
+ return false;
+ }
+
+ return isInductionPHI(Phi, PSE.getSE(), D, AR);
+}
+
+bool InductionDescriptor::isInductionPHI(PHINode *Phi,
+ ScalarEvolution *SE,
+ InductionDescriptor &D,
+ const SCEV *Expr) {
Type *PhiTy = Phi->getType();
// We only handle integer and pointer induction variables.
if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
return false;
// Check that the PHI is consecutive.
- const SCEV *PhiScev = SE->getSCEV(Phi);
+ const SCEV *PhiScev = Expr ? Expr : SE->getSCEV(Phi);
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
+
if (!AR) {
DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
return false;
}
@@ -678,17 +779,22 @@ bool InductionDescriptor::isInductionPHI(PHINode *Phi, ScalarEvolution *SE,
Phi->getIncomingValueForBlock(AR->getLoop()->getLoopPreheader());
const SCEV *Step = AR->getStepRecurrence(*SE);
// Calculate the pointer stride and check if it is consecutive.
- const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
- if (!C)
+ // The stride may be a constant or a loop invariant integer value.
+ const SCEVConstant *ConstStep = dyn_cast<SCEVConstant>(Step);
+ if (!ConstStep && !SE->isLoopInvariant(Step, AR->getLoop()))
return false;
- ConstantInt *CV = C->getValue();
if (PhiTy->isIntegerTy()) {
- D = InductionDescriptor(StartValue, IK_IntInduction, CV);
+ D = InductionDescriptor(StartValue, IK_IntInduction, Step);
return true;
}
assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
+ // The step of a pointer induction must be a constant.
+ if (!ConstStep)
+ return false;
+
+ ConstantInt *CV = ConstStep->getValue();
Type *PointerElementType = PhiTy->getPointerElementType();
// The pointer stride cannot be determined if the pointer element type is not
// sized.
@@ -703,8 +809,8 @@ bool InductionDescriptor::isInductionPHI(PHINode *Phi, ScalarEvolution *SE,
int64_t CVSize = CV->getSExtValue();
if (CVSize % Size)
return false;
- auto *StepValue = ConstantInt::getSigned(CV->getType(), CVSize / Size);
-
+ auto *StepValue = SE->getConstant(CV->getType(), CVSize / Size,
+ true /* signed */);
D = InductionDescriptor(StartValue, IK_PtrInduction, StepValue);
return true;
}
@@ -727,3 +833,137 @@ SmallVector<Instruction *, 8> llvm::findDefsUsedOutsideOfLoop(Loop *L) {
return UsedOutside;
}
+
+void llvm::getLoopAnalysisUsage(AnalysisUsage &AU) {
+ // By definition, all loop passes need the LoopInfo analysis and the
+ // Dominator tree it depends on. Because they all participate in the loop
+ // pass manager, they must also preserve these.
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+
+ // We must also preserve LoopSimplify and LCSSA. We locally access their IDs
+ // here because users shouldn't directly get them from this header.
+ extern char &LoopSimplifyID;
+ extern char &LCSSAID;
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+ AU.addPreservedID(LCSSAID);
+
+ // Loop passes are designed to run inside of a loop pass manager which means
+ // that any function analyses they require must be required by the first loop
+ // pass in the manager (so that it is computed before the loop pass manager
+ // runs) and preserved by all loop passes in the manager. To make this
+ // reasonably robust, the set needed for most loop passes is maintained here.
+ // If your loop pass requires an analysis not listed here, you will need to
+ // carefully audit the loop pass manager nesting structure that results.
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<SCEVAAWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+}
+
+/// Manually defined generic "LoopPass" dependency initialization. This is used
+/// to initialize the exact set of passes from above in \c
+/// getLoopAnalysisUsage. It can be used within a loop pass's initialization
+/// with:
+///
+/// INITIALIZE_PASS_DEPENDENCY(LoopPass)
+///
+/// As-if "LoopPass" were a pass.
+void llvm::initializeLoopPassPass(PassRegistry &Registry) {
+ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+ INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+}
+
+/// \brief Find string metadata for loop
+///
+/// If it has a value (e.g. {"llvm.distribute", 1}) return the value as an
+/// operand or null otherwise. If the string metadata is not found return
+/// Optional's not-a-value.
+Optional<const MDOperand *> llvm::findStringMetadataForLoop(Loop *TheLoop,
+ StringRef Name) {
+ MDNode *LoopID = TheLoop->getLoopID();
+ // Return None if LoopID is null.
+ if (!LoopID)
+ return None;
+
+ // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+ assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+ // Iterate over LoopID operands and look for MDString Metadata
+ for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
+ MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (!MD)
+ continue;
+ MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+ if (!S)
+ continue;
+ // Return the value if the MDString matches the expected name.
+ if (Name.equals(S->getString()))
+ switch (MD->getNumOperands()) {
+ case 1:
+ return nullptr;
+ case 2:
+ return &MD->getOperand(1);
+ default:
+ llvm_unreachable("loop metadata has 0 or 1 operand");
+ }
+ }
+ return None;
+}
+
+/// Returns true if the instruction in a loop is guaranteed to execute at least
+/// once.
+bool llvm::isGuaranteedToExecute(const Instruction &Inst,
+ const DominatorTree *DT, const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo) {
+ // We have to check to make sure that the instruction dominates all
+ // of the exit blocks. If it doesn't, then there is a path out of the loop
+ // which does not execute this instruction, so we can't hoist it.
+
+ // If the instruction is in the header block for the loop (which is very
+ // common), it is always guaranteed to dominate the exit blocks. Since this
+ // is a common case, and can save some work, check it now.
+ if (Inst.getParent() == CurLoop->getHeader())
+ // If there's a throw in the header block, we can't guarantee we'll reach
+ // Inst.
+ return !SafetyInfo->HeaderMayThrow;
+
+ // Somewhere in this loop there is an instruction which may throw and make us
+ // exit the loop.
+ if (SafetyInfo->MayThrow)
+ return false;
+
+ // Get the exit blocks for the current loop.
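Callers of findStringMetadataForLoop have to distinguish three outcomes: metadata absent, present without a value, and present with a value. A hedged consumer sketch; the metadata name and the boolean interpretation are assumptions for illustration:

// Illustrative only: assumes the loop carries boolean-style metadata.
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;

static bool isLoopDistributeEnabled(Loop *L) {
  Optional<const MDOperand *> Op =
      findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
  if (!Op)
    return false;          // metadata not present on the loop
  if (!*Op)
    return false;          // present, but carries no value operand
  if (auto *C = mdconst::dyn_extract<ConstantInt>(**Op))
    return !C->isZero();   // present with a constant value
  return false;
}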
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ CurLoop->getExitBlocks(ExitBlocks);
+
+ // Verify that the block dominates each of the exit blocks of the loop.
+ for (BasicBlock *ExitBlock : ExitBlocks)
+ if (!DT->dominates(Inst.getParent(), ExitBlock))
+ return false;
+
+ // As a degenerate case, if the loop is statically infinite then we haven't
+ // proven anything since there are no exit blocks.
+ if (ExitBlocks.empty())
+ return false;
+
+ // FIXME: In general, we have to prove that the loop isn't an infinite loop.
+ // See http://llvm.org/PR24078. (The "ExitBlocks.empty()" check above is
+ // just a special case of this.)
+ return true;
+}
diff --git a/lib/Transforms/Utils/LoopVersioning.cpp b/lib/Transforms/Utils/LoopVersioning.cpp
index 9a2a06cf68915..b3c61691da30a 100644
--- a/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/lib/Transforms/Utils/LoopVersioning.cpp
@@ -18,11 +18,18 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
using namespace llvm;
+static cl::opt<bool>
+ AnnotateNoAlias("loop-version-annotate-no-alias", cl::init(true),
+ cl::Hidden,
+ cl::desc("Add no-alias annotation for instructions that "
+ "are disambiguated by memchecks"));
+
LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI,
DominatorTree *DT, ScalarEvolution *SE,
bool UseLAIChecks)
@@ -32,12 +39,12 @@ LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI,
assert(L->getLoopPreheader() && "No preheader");
if (UseLAIChecks) {
setAliasChecks(LAI.getRuntimePointerChecking()->getChecks());
- setSCEVChecks(LAI.PSE.getUnionPredicate());
+ setSCEVChecks(LAI.getPSE().getUnionPredicate());
}
}
void LoopVersioning::setAliasChecks(
- const SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks) {
+ SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks) {
AliasChecks = std::move(Checks);
}
@@ -56,9 +63,8 @@ void LoopVersioning::versionLoop(
BasicBlock *RuntimeCheckBB = VersionedLoop->getLoopPreheader();
std::tie(FirstCheckInst, MemRuntimeCheck) =
LAI.addRuntimeChecks(RuntimeCheckBB->getTerminator(), AliasChecks);
- assert(MemRuntimeCheck && "called even though needsAnyChecking = false");
- const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate();
+ const SCEVUnionPredicate &Pred = LAI.getPSE().getUnionPredicate();
SCEVExpander Exp(*SE, RuntimeCheckBB->getModule()->getDataLayout(),
"scev.check");
SCEVRuntimeCheck =
@@ -71,7 +77,7 @@ void LoopVersioning::versionLoop(
if (MemRuntimeCheck && SCEVRuntimeCheck) {
RuntimeCheck = BinaryOperator::Create(Instruction::Or, MemRuntimeCheck,
- SCEVRuntimeCheck, "ldist.safe");
+ SCEVRuntimeCheck, "lver.safe");
if (auto *I = dyn_cast<Instruction>(RuntimeCheck))
I->insertBefore(RuntimeCheckBB->getTerminator());
} else
@@ -119,16 +125,14 @@ void LoopVersioning::addPHINodes(
const SmallVectorImpl<Instruction *> &DefsUsedOutside) {
BasicBlock *PHIBlock = VersionedLoop->getExitBlock();
assert(PHIBlock && "No single successor to loop exit block");
+ PHINode *PN;
+ // First add a single-operand PHI for each DefsUsedOutside if one does not
+ // exist yet.
for (auto *Inst : DefsUsedOutside) {
- auto *NonVersionedLoopInst = cast<Instruction>(VMap[Inst]);
- PHINode *PN;
-
- // First see if we have a single-operand PHI with the value defined by the
+ // See if we have a single-operand PHI with the value defined by the
// original loop.
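Note the polarity of the combined condition built in versionLoop(): both MemRuntimeCheck and SCEVRuntimeCheck are true when a check fails, so the OR ("lver.safe") routes control to the unoptimized copy whenever any check fires. A source-level analogy of the emitted structure (illustrative only):

// Illustrative only: the CFG shape versionLoop() produces.
void versionedShape(bool memCheckFailed, bool scevCheckFailed) {
  if (memCheckFailed || scevCheckFailed) {
    // non-versioned loop: the conservative original runs
  } else {
    // versioned loop: runtime checks proved the accesses disjoint
  }
}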
for (auto I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) {
- assert(PN->getNumOperands() == 1 &&
- "Exit block should only have on predecessor");
if (PN->getIncomingValue(0) == Inst)
break;
}
@@ -141,7 +145,179 @@ void LoopVersioning::addPHINodes(
User->replaceUsesOfWith(Inst, PN);
PN->addIncoming(Inst, VersionedLoop->getExitingBlock());
}
- // Add the new incoming value from the non-versioned loop.
- PN->addIncoming(NonVersionedLoopInst, NonVersionedLoop->getExitingBlock());
}
+
+ // Then for each PHI add the operand for the edge from the cloned loop.
+ for (auto I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) {
+ assert(PN->getNumOperands() == 1 &&
+ "Exit block should only have one predecessor");
+
+ // If the definition was cloned, use that; otherwise use the same value.
+ Value *ClonedValue = PN->getIncomingValue(0);
+ auto Mapped = VMap.find(ClonedValue);
+ if (Mapped != VMap.end())
+ ClonedValue = Mapped->second;
+
+ PN->addIncoming(ClonedValue, NonVersionedLoop->getExitingBlock());
+ }
+}
+
+void LoopVersioning::prepareNoAliasMetadata() {
+ // We need to turn the no-alias relation between pointer checking groups into
+ // no-aliasing annotations between instructions.
+ //
+ // We accomplish this by mapping each pointer checking group (a set of
+ // pointers memchecked together) to an alias scope and then also mapping each
+ // group to the list of scopes it can't alias.
+
+ const RuntimePointerChecking *RtPtrChecking = LAI.getRuntimePointerChecking();
+ LLVMContext &Context = VersionedLoop->getHeader()->getContext();
+
+ // First allocate an aliasing scope for each pointer checking group.
+ //
+ // While traversing through the checking groups in the loop, also create a
+ // reverse map from pointers to the pointer checking group they were assigned
+ // to.
+ MDBuilder MDB(Context);
+ MDNode *Domain = MDB.createAnonymousAliasScopeDomain("LVerDomain");
+
+ for (const auto &Group : RtPtrChecking->CheckingGroups) {
+ GroupToScope[&Group] = MDB.createAnonymousAliasScope(Domain);
+
+ for (unsigned PtrIdx : Group.Members)
+ PtrToGroup[RtPtrChecking->getPointerInfo(PtrIdx).PointerValue] = &Group;
+ }
+
+ // Go through the checks and for each pointer group, collect the scopes for
+ // each non-aliasing pointer group.
+ DenseMap<const RuntimePointerChecking::CheckingPtrGroup *,
+ SmallVector<Metadata *, 4>>
+ GroupToNonAliasingScopes;
+
+ for (const auto &Check : AliasChecks)
+ GroupToNonAliasingScopes[Check.first].push_back(GroupToScope[Check.second]);
+
+ // Finally, transform the above to actually map to scope list which is what
+ // the metadata uses.
+
+ for (auto Pair : GroupToNonAliasingScopes)
+ GroupToNonAliasingScopeList[Pair.first] = MDNode::get(Context, Pair.second);
+}
+
+void LoopVersioning::annotateLoopWithNoAlias() {
+ if (!AnnotateNoAlias)
+ return;
+
+ // First prepare the maps.
+ prepareNoAliasMetadata();
+
+ // Add the scope and no-alias metadata to the instructions.
+ for (Instruction *I : LAI.getDepChecker().getMemoryInstructions()) {
+ annotateInstWithNoAlias(I);
+ }
+}
+
+void LoopVersioning::annotateInstWithNoAlias(Instruction *VersionedInst,
+ const Instruction *OrigInst) {
+ if (!AnnotateNoAlias)
+ return;
+
+ LLVMContext &Context = VersionedLoop->getHeader()->getContext();
+ const Value *Ptr = isa<LoadInst>(OrigInst)
+ ? cast<LoadInst>(OrigInst)->getPointerOperand()
+ : cast<StoreInst>(OrigInst)->getPointerOperand();
+
+ // Find the group for the pointer and then add the scope metadata.
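A minimal MDBuilder sketch of the metadata scheme prepareNoAliasMetadata builds, reduced to two hypothetical checking groups (all names assumed; the real code derives the scopes from RuntimePointerChecking):

// Illustrative only: one shared domain, one anonymous scope per group; an
// instruction advertises its own scope and declares no-alias with the
// scopes of groups the memchecks proved disjoint.
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
using namespace llvm;

static void tagDisjointGroups(Instruction *I, LLVMContext &Ctx) {
  MDBuilder MDB(Ctx);
  MDNode *Domain = MDB.createAnonymousAliasScopeDomain("LVerDomain");
  MDNode *OwnScope = MDB.createAnonymousAliasScope(Domain);
  MDNode *OtherScope = MDB.createAnonymousAliasScope(Domain);
  I->setMetadata(LLVMContext::MD_alias_scope, MDNode::get(Ctx, {OwnScope}));
  I->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, {OtherScope}));
}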
+ auto Group = PtrToGroup.find(Ptr);
+ if (Group != PtrToGroup.end()) {
+ VersionedInst->setMetadata(
+ LLVMContext::MD_alias_scope,
+ MDNode::concatenate(
+ VersionedInst->getMetadata(LLVMContext::MD_alias_scope),
+ MDNode::get(Context, GroupToScope[Group->second])));
+
+ // Add the no-alias metadata.
+ auto NonAliasingScopeList = GroupToNonAliasingScopeList.find(Group->second);
+ if (NonAliasingScopeList != GroupToNonAliasingScopeList.end())
+ VersionedInst->setMetadata(
+ LLVMContext::MD_noalias,
+ MDNode::concatenate(
+ VersionedInst->getMetadata(LLVMContext::MD_noalias),
+ NonAliasingScopeList->second));
+ }
+}
+
+namespace {
+/// \brief Also expose this as a pass. Currently this is only used for
+/// unit-testing. It adds all memchecks necessary to remove all may-aliasing
+/// array accesses from the loop.
+class LoopVersioningPass : public FunctionPass {
+public:
+ LoopVersioningPass() : FunctionPass(ID) {
+ initializeLoopVersioningPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+
+ // Build up a worklist of inner-loops to version. This is necessary as the
+ // act of versioning a loop creates new loops and can invalidate iterators
+ // across the loops.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *TopLevelLoop : *LI)
+ for (Loop *L : depth_first(TopLevelLoop))
+ // We only handle inner-most loops.
+ if (L->empty())
+ Worklist.push_back(L);
+
+ // Now walk the identified inner loops.
+ bool Changed = false;
+ for (Loop *L : Worklist) {
+ const LoopAccessInfo &LAI = LAA->getInfo(L);
+ if (LAI.getNumRuntimePointerChecks() ||
+ !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) {
+ LoopVersioning LVer(LAI, L, LI, DT, SE);
+ LVer.versionLoop();
+ LVer.annotateLoopWithNoAlias();
+ Changed = true;
+ }
+ }
+
+ return Changed;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ }
+
+ static char ID;
+};
+}
+
+#define LVER_OPTION "loop-versioning"
+#define DEBUG_TYPE LVER_OPTION
+
+char LoopVersioningPass::ID;
+static const char LVer_name[] = "Loop Versioning";
+
+INITIALIZE_PASS_BEGIN(LoopVersioningPass, LVER_OPTION, LVer_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(LoopVersioningPass, LVER_OPTION, LVer_name, false, false)
+
+namespace llvm {
+FunctionPass *createLoopVersioningPass() {
+ return new LoopVersioningPass();
+}
}
diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp
index b0ad4d5e84a1b..1b31c5ae580a1 100644
--- a/lib/Transforms/Utils/LowerInvoke.cpp
+++ b/lib/Transforms/Utils/LowerInvoke.cpp
@@ -14,14 +14,13 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h" #include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Scalar.h" using namespace llvm; #define DEBUG_TYPE "lowerinvoke" @@ -53,8 +52,8 @@ FunctionPass *llvm::createLowerInvokePass() { bool LowerInvoke::runOnFunction(Function &F) { bool Changed = false; - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) - if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) { + for (BasicBlock &BB : F) + if (InvokeInst *II = dyn_cast<InvokeInst>(BB.getTerminator())) { SmallVector<Value*,16> CallArgs(II->op_begin(), II->op_end() - 3); // Insert a normal call instruction... CallInst *NewCall = CallInst::Create(II->getCalledValue(), @@ -69,10 +68,10 @@ bool LowerInvoke::runOnFunction(Function &F) { BranchInst::Create(II->getNormalDest(), II); // Remove any PHI node entries from the exception destination. - II->getUnwindDest()->removePredecessor(&*BB); + II->getUnwindDest()->removePredecessor(&BB); // Remove the invoke instruction now. - BB->getInstList().erase(II); + BB.getInstList().erase(II); ++NumInvokes; Changed = true; } diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp index 52beb1542497e..5c07469869ff7 100644 --- a/lib/Transforms/Utils/LowerSwitch.cpp +++ b/lib/Transforms/Utils/LowerSwitch.cpp @@ -59,12 +59,6 @@ namespace { bool runOnFunction(Function &F) override; - void getAnalysisUsage(AnalysisUsage &AU) const override { - // This is a cluster of orthogonal Transforms - AU.addPreserved<UnifyFunctionExitNodes>(); - AU.addPreservedID(LowerInvokePassID); - } - struct CaseRange { ConstantInt* Low; ConstantInt* High; @@ -192,8 +186,8 @@ static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB, } // Remove incoming values in the reverse order to prevent invalidating // *successive* index. - for (auto III = Indices.rbegin(), IIE = Indices.rend(); III != IIE; ++III) - PN->removeIncomingValue(*III); + for (unsigned III : reverse(Indices)) + PN->removeIncomingValue(III); } } diff --git a/lib/Transforms/Utils/Makefile b/lib/Transforms/Utils/Makefile deleted file mode 100644 index d1e9336d67f02..0000000000000 --- a/lib/Transforms/Utils/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Transforms/Utils/Makefile -----------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../.. 
-LIBRARYNAME = LLVMTransformUtils -BUILD_ARCHIVE = 1 - -include $(LEVEL)/Makefile.common - diff --git a/lib/Transforms/Utils/Mem2Reg.cpp b/lib/Transforms/Utils/Mem2Reg.cpp index aa1e35ddba024..1419254bcb4f4 100644 --- a/lib/Transforms/Utils/Mem2Reg.cpp +++ b/lib/Transforms/Utils/Mem2Reg.cpp @@ -12,12 +12,13 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Mem2Reg.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" using namespace llvm; @@ -26,51 +27,11 @@ using namespace llvm; STATISTIC(NumPromoted, "Number of alloca's promoted"); -namespace { - struct PromotePass : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - PromotePass() : FunctionPass(ID) { - initializePromotePassPass(*PassRegistry::getPassRegistry()); - } - - // runOnFunction - To run this pass, first we calculate the alloca - // instructions that are safe for promotion, then we promote each one. - // - bool runOnFunction(Function &F) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.setPreservesCFG(); - // This is a cluster of orthogonal Transforms - AU.addPreserved<UnifyFunctionExitNodes>(); - AU.addPreservedID(LowerSwitchID); - AU.addPreservedID(LowerInvokePassID); - } - }; -} // end of anonymous namespace - -char PromotePass::ID = 0; -INITIALIZE_PASS_BEGIN(PromotePass, "mem2reg", "Promote Memory to Register", - false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(PromotePass, "mem2reg", "Promote Memory to Register", - false, false) - -bool PromotePass::runOnFunction(Function &F) { - std::vector<AllocaInst*> Allocas; - - BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function - - if (F.hasFnAttribute(Attribute::OptimizeNone)) - return false; - - bool Changed = false; - - DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - AssumptionCache &AC = - getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); +static bool promoteMemoryToRegister(Function &F, DominatorTree &DT, + AssumptionCache &AC) { + std::vector<AllocaInst *> Allocas; + BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function + bool Changed = false; while (1) { Allocas.clear(); @@ -78,22 +39,69 @@ bool PromotePass::runOnFunction(Function &F) { // Find allocas that are safe to promote, by looking at all instructions in // the entry node for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) - if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca? + if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca? 
if (isAllocaPromotable(AI)) Allocas.push_back(AI); - if (Allocas.empty()) break; + if (Allocas.empty()) + break; PromoteMemToReg(Allocas, DT, nullptr, &AC); NumPromoted += Allocas.size(); Changed = true; } - return Changed; } +PreservedAnalyses PromotePass::run(Function &F, AnalysisManager<Function> &AM) { + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + auto &AC = AM.getResult<AssumptionAnalysis>(F); + if (!promoteMemoryToRegister(F, DT, AC)) + return PreservedAnalyses::all(); + + // FIXME: This should also 'preserve the CFG'. + return PreservedAnalyses::none(); +} + +namespace { +struct PromoteLegacyPass : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + PromoteLegacyPass() : FunctionPass(ID) { + initializePromoteLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + // runOnFunction - To run this pass, first we calculate the alloca + // instructions that are safe for promotion, then we promote each one. + // + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + AssumptionCache &AC = + getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + return promoteMemoryToRegister(F, DT, AC); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.setPreservesCFG(); + } + }; +} // end of anonymous namespace + +char PromoteLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(PromoteLegacyPass, "mem2reg", "Promote Memory to " + "Register", + false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(PromoteLegacyPass, "mem2reg", "Promote Memory to Register", + false, false) + // createPromoteMemoryToRegister - Provide an entry point to create this pass. // FunctionPass *llvm::createPromoteMemoryToRegisterPass() { - return new PromotePass(); + return new PromoteLegacyPass(); } diff --git a/lib/Transforms/Utils/MemorySSA.cpp b/lib/Transforms/Utils/MemorySSA.cpp new file mode 100644 index 0000000000000..8ba3cae43b188 --- /dev/null +++ b/lib/Transforms/Utils/MemorySSA.cpp @@ -0,0 +1,1361 @@ +//===-- MemorySSA.cpp - Memory SSA Builder---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------===// +// +// This file implements the MemorySSA class. 
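Before the implementation details, a hedged orientation sketch: once MemorySSA is built for a function, the typical query is "which write does this read observe?", answered through the walker defined below (the scenario is assumed; the API names are from this file):

// Illustrative only: query the clobbering definition of a load.
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/MemorySSA.h"
using namespace llvm;

void inspectLoad(MemorySSA &MSSA, LoadInst *LI) {
  MemorySSAWalker *Walker = MSSA.getWalker();
  MemoryAccess *Clobber = Walker->getClobberingMemoryAccess(LI);
  if (MSSA.isLiveOnEntryDef(Clobber)) {
    // No write inside the function affects this load; it reads state
    // that was live on entry.
  }
}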
+// +//===----------------------------------------------------------------===// +#include "llvm/Transforms/Utils/MemorySSA.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/IteratedDominanceFrontier.h" +#include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/PHITransAddr.h" +#include "llvm/IR/AssemblyAnnotationWriter.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Transforms/Scalar.h" +#include <algorithm> + +#define DEBUG_TYPE "memoryssa" +using namespace llvm; +STATISTIC(NumClobberCacheLookups, "Number of Memory SSA version cache lookups"); +STATISTIC(NumClobberCacheHits, "Number of Memory SSA version cache hits"); +STATISTIC(NumClobberCacheInserts, "Number of MemorySSA version cache inserts"); + +INITIALIZE_PASS_BEGIN(MemorySSAWrapperPass, "memoryssa", "Memory SSA", false, + true) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_END(MemorySSAWrapperPass, "memoryssa", "Memory SSA", false, + true) + +INITIALIZE_PASS_BEGIN(MemorySSAPrinterLegacyPass, "print-memoryssa", + "Memory SSA Printer", false, false) +INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) +INITIALIZE_PASS_END(MemorySSAPrinterLegacyPass, "print-memoryssa", + "Memory SSA Printer", false, false) + +static cl::opt<bool> + VerifyMemorySSA("verify-memoryssa", cl::init(false), cl::Hidden, + cl::desc("Verify MemorySSA in legacy printer pass.")); + +namespace llvm { +/// \brief An assembly annotator class to print Memory SSA information in +/// comments. +class MemorySSAAnnotatedWriter : public AssemblyAnnotationWriter { + friend class MemorySSA; + const MemorySSA *MSSA; + +public: + MemorySSAAnnotatedWriter(const MemorySSA *M) : MSSA(M) {} + + virtual void emitBasicBlockStartAnnot(const BasicBlock *BB, + formatted_raw_ostream &OS) { + if (MemoryAccess *MA = MSSA->getMemoryAccess(BB)) + OS << "; " << *MA << "\n"; + } + + virtual void emitInstructionAnnot(const Instruction *I, + formatted_raw_ostream &OS) { + if (MemoryAccess *MA = MSSA->getMemoryAccess(I)) + OS << "; " << *MA << "\n"; + } +}; + +/// \brief A MemorySSAWalker that does AA walks and caching of lookups to +/// disambiguate accesses. +/// +/// FIXME: The current implementation of this can take quadratic space in rare +/// cases. This can be fixed, but it is something to note until it is fixed. +/// +/// In order to trigger this behavior, you need to store to N distinct locations +/// (that AA can prove don't alias), perform M stores to other memory +/// locations that AA can prove don't alias any of the initial N locations, and +/// then load from all of the N locations. In this case, we insert M cache +/// entries for each of the N loads. 
+/// +/// For example: +/// define i32 @foo() { +/// %a = alloca i32, align 4 +/// %b = alloca i32, align 4 +/// store i32 0, i32* %a, align 4 +/// store i32 0, i32* %b, align 4 +/// +/// ; Insert M stores to other memory that doesn't alias %a or %b here +/// +/// %c = load i32, i32* %a, align 4 ; Caches M entries in +/// ; CachedUpwardsClobberingAccess for the +/// ; MemoryLocation %a +/// %d = load i32, i32* %b, align 4 ; Caches M entries in +/// ; CachedUpwardsClobberingAccess for the +/// ; MemoryLocation %b +/// +/// ; For completeness' sake, loading %a or %b again would not cache *another* +/// ; M entries. +/// %r = add i32 %c, %d +/// ret i32 %r +/// } +class MemorySSA::CachingWalker final : public MemorySSAWalker { +public: + CachingWalker(MemorySSA *, AliasAnalysis *, DominatorTree *); + ~CachingWalker() override; + + MemoryAccess *getClobberingMemoryAccess(const Instruction *) override; + MemoryAccess *getClobberingMemoryAccess(MemoryAccess *, + MemoryLocation &) override; + void invalidateInfo(MemoryAccess *) override; + +protected: + struct UpwardsMemoryQuery; + MemoryAccess *doCacheLookup(const MemoryAccess *, const UpwardsMemoryQuery &, + const MemoryLocation &); + + void doCacheInsert(const MemoryAccess *, MemoryAccess *, + const UpwardsMemoryQuery &, const MemoryLocation &); + + void doCacheRemove(const MemoryAccess *, const UpwardsMemoryQuery &, + const MemoryLocation &); + +private: + MemoryAccessPair UpwardsDFSWalk(MemoryAccess *, const MemoryLocation &, + UpwardsMemoryQuery &, bool); + MemoryAccess *getClobberingMemoryAccess(MemoryAccess *, UpwardsMemoryQuery &); + bool instructionClobbersQuery(const MemoryDef *, UpwardsMemoryQuery &, + const MemoryLocation &Loc) const; + void verifyRemoved(MemoryAccess *); + SmallDenseMap<ConstMemoryAccessPair, MemoryAccess *> + CachedUpwardsClobberingAccess; + DenseMap<const MemoryAccess *, MemoryAccess *> CachedUpwardsClobberingCall; + AliasAnalysis *AA; + DominatorTree *DT; +}; +} + +namespace { +struct RenamePassData { + DomTreeNode *DTN; + DomTreeNode::const_iterator ChildIt; + MemoryAccess *IncomingVal; + + RenamePassData(DomTreeNode *D, DomTreeNode::const_iterator It, + MemoryAccess *M) + : DTN(D), ChildIt(It), IncomingVal(M) {} + void swap(RenamePassData &RHS) { + std::swap(DTN, RHS.DTN); + std::swap(ChildIt, RHS.ChildIt); + std::swap(IncomingVal, RHS.IncomingVal); + } +}; +} + +namespace llvm { +/// \brief Rename a single basic block into MemorySSA form. +/// Uses the standard SSA renaming algorithm. +/// \returns The new incoming value. +MemoryAccess *MemorySSA::renameBlock(BasicBlock *BB, + MemoryAccess *IncomingVal) { + auto It = PerBlockAccesses.find(BB); + // Skip most processing if the list is empty. + if (It != PerBlockAccesses.end()) { + AccessList *Accesses = It->second.get(); + for (MemoryAccess &L : *Accesses) { + switch (L.getValueID()) { + case Value::MemoryUseVal: + cast<MemoryUse>(&L)->setDefiningAccess(IncomingVal); + break; + case Value::MemoryDefVal: + // We can't legally optimize defs, because we only allow single + // memory phis/uses on operations, and if we optimize these, we can + // end up with multiple reaching defs. 
Uses do not have this + // problem, since they do not produce a value + cast<MemoryDef>(&L)->setDefiningAccess(IncomingVal); + IncomingVal = &L; + break; + case Value::MemoryPhiVal: + IncomingVal = &L; + break; + } + } + } + + // Pass through values to our successors + for (const BasicBlock *S : successors(BB)) { + auto It = PerBlockAccesses.find(S); + // Rename the phi nodes in our successor block + if (It == PerBlockAccesses.end() || !isa<MemoryPhi>(It->second->front())) + continue; + AccessList *Accesses = It->second.get(); + auto *Phi = cast<MemoryPhi>(&Accesses->front()); + Phi->addIncoming(IncomingVal, BB); + } + + return IncomingVal; +} + +/// \brief This is the standard SSA renaming algorithm. +/// +/// We walk the dominator tree in preorder, renaming accesses, and then filling +/// in phi nodes in our successors. +void MemorySSA::renamePass(DomTreeNode *Root, MemoryAccess *IncomingVal, + SmallPtrSet<BasicBlock *, 16> &Visited) { + SmallVector<RenamePassData, 32> WorkStack; + IncomingVal = renameBlock(Root->getBlock(), IncomingVal); + WorkStack.push_back({Root, Root->begin(), IncomingVal}); + Visited.insert(Root->getBlock()); + + while (!WorkStack.empty()) { + DomTreeNode *Node = WorkStack.back().DTN; + DomTreeNode::const_iterator ChildIt = WorkStack.back().ChildIt; + IncomingVal = WorkStack.back().IncomingVal; + + if (ChildIt == Node->end()) { + WorkStack.pop_back(); + } else { + DomTreeNode *Child = *ChildIt; + ++WorkStack.back().ChildIt; + BasicBlock *BB = Child->getBlock(); + Visited.insert(BB); + IncomingVal = renameBlock(BB, IncomingVal); + WorkStack.push_back({Child, Child->begin(), IncomingVal}); + } + } +} + +/// \brief Compute dominator levels, used by the phi insertion algorithm above. +void MemorySSA::computeDomLevels(DenseMap<DomTreeNode *, unsigned> &DomLevels) { + for (auto DFI = df_begin(DT->getRootNode()), DFE = df_end(DT->getRootNode()); + DFI != DFE; ++DFI) + DomLevels[*DFI] = DFI.getPathLength() - 1; +} + +/// \brief This handles unreachable block accesses by deleting phi nodes in +/// unreachable blocks, and marking all other unreachable MemoryAccess's as +/// being uses of the live on entry definition. +void MemorySSA::markUnreachableAsLiveOnEntry(BasicBlock *BB) { + assert(!DT->isReachableFromEntry(BB) && + "Reachable block found while handling unreachable blocks"); + + // Make sure phi nodes in our reachable successors end up with a + // LiveOnEntryDef for our incoming edge, even though our block is forward + // unreachable. We could just disconnect these blocks from the CFG fully, + // but we do not right now. + for (const BasicBlock *S : successors(BB)) { + if (!DT->isReachableFromEntry(S)) + continue; + auto It = PerBlockAccesses.find(S); + // Rename the phi nodes in our successor block + if (It == PerBlockAccesses.end() || !isa<MemoryPhi>(It->second->front())) + continue; + AccessList *Accesses = It->second.get(); + auto *Phi = cast<MemoryPhi>(&Accesses->front()); + Phi->addIncoming(LiveOnEntryDef.get(), BB); + } + + auto It = PerBlockAccesses.find(BB); + if (It == PerBlockAccesses.end()) + return; + + auto &Accesses = It->second; + for (auto AI = Accesses->begin(), AE = Accesses->end(); AI != AE;) { + auto Next = std::next(AI); + // If we have a phi, just remove it. We are going to replace all + // users with live on entry. 
+ if (auto *UseOrDef = dyn_cast<MemoryUseOrDef>(AI))
+ UseOrDef->setDefiningAccess(LiveOnEntryDef.get());
+ else
+ Accesses->erase(AI);
+ AI = Next;
+ }
+}
+
+MemorySSA::MemorySSA(Function &Func, AliasAnalysis *AA, DominatorTree *DT)
+ : AA(AA), DT(DT), F(Func), LiveOnEntryDef(nullptr), Walker(nullptr),
+ NextID(0) {
+ buildMemorySSA();
+}
+
+MemorySSA::MemorySSA(MemorySSA &&MSSA)
+ : AA(MSSA.AA), DT(MSSA.DT), F(MSSA.F),
+ ValueToMemoryAccess(std::move(MSSA.ValueToMemoryAccess)),
+ PerBlockAccesses(std::move(MSSA.PerBlockAccesses)),
+ LiveOnEntryDef(std::move(MSSA.LiveOnEntryDef)),
+ Walker(std::move(MSSA.Walker)), NextID(MSSA.NextID) {
+ // Update the Walker MSSA pointer so it doesn't point to the moved-from MSSA
+ // object any more.
+ Walker->MSSA = this;
+}
+
+MemorySSA::~MemorySSA() {
+ // Drop all our references
+ for (const auto &Pair : PerBlockAccesses)
+ for (MemoryAccess &MA : *Pair.second)
+ MA.dropAllReferences();
+}
+
+MemorySSA::AccessList *MemorySSA::getOrCreateAccessList(const BasicBlock *BB) {
+ auto Res = PerBlockAccesses.insert(std::make_pair(BB, nullptr));
+
+ if (Res.second)
+ Res.first->second = make_unique<AccessList>();
+ return Res.first->second.get();
+}
+
+void MemorySSA::buildMemorySSA() {
+ // We create an access to represent "live on entry", for things like
+ // arguments or users of globals, where the memory they use is defined before
+ // the beginning of the function. We do not actually insert it into the IR.
+ // We do not define a live on exit for the immediate uses, and thus our
+ // semantics do *not* imply that something with no immediate uses can simply
+ // be removed.
+ BasicBlock &StartingPoint = F.getEntryBlock();
+ LiveOnEntryDef = make_unique<MemoryDef>(F.getContext(), nullptr, nullptr,
+ &StartingPoint, NextID++);
+
+ // We maintain lists of memory accesses per-block, trading memory for time. We
+ // could just look up the memory access for every possible instruction in the
+ // stream.
+ SmallPtrSet<BasicBlock *, 32> DefiningBlocks;
+ SmallPtrSet<BasicBlock *, 32> DefUseBlocks;
+ // Go through each block, figure out where defs occur, and chain together all
+ // the accesses.
+ for (BasicBlock &B : F) {
+ bool InsertIntoDef = false;
+ AccessList *Accesses = nullptr;
+ for (Instruction &I : B) {
+ MemoryUseOrDef *MUD = createNewAccess(&I);
+ if (!MUD)
+ continue;
+ InsertIntoDef |= isa<MemoryDef>(MUD);
+
+ if (!Accesses)
+ Accesses = getOrCreateAccessList(&B);
+ Accesses->push_back(MUD);
+ }
+ if (InsertIntoDef)
+ DefiningBlocks.insert(&B);
+ if (Accesses)
+ DefUseBlocks.insert(&B);
+ }
+
+ // Compute live-in.
+ // Live-in is normally defined as "all the blocks on the path from each def to
+ // each of its uses".
+ // MemoryDef's are implicit uses of previous state, so they are also uses.
+ // This means we don't really have def-only instructions. The only
+ // MemoryDef's that are not really uses are those that are of the LiveOnEntry
+ // variable (because LiveOnEntry can reach anywhere, and every def is a
+ // must-kill of LiveOnEntry).
+ // In theory, you could precisely compute live-in by using alias-analysis to
+ // disambiguate defs and uses to see which really pair up with which.
+ // In practice, this would be really expensive and difficult. So we simply
+ // assume all defs are also uses that need to be kept live.
+ // Because of this, the end result of this live-in computation will be "the
+ // entire set of basic blocks that reach any use".
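As a worked example of the access chains this construction produces, consider a straight-line function; the numbering follows the print format defined later in this file, and liveOnEntry is the synthetic def created above (illustrative only):

// Illustrative only: the MemorySSA annotations for each statement.
void renameExample(int *a, int *b) {
  *a = 1;      // 1 = MemoryDef(liveOnEntry)
  int x = *a;  // MemoryUse(1)
  *b = x;      // 2 = MemoryDef(1): defs chain through all prior state
  int y = *b;  // MemoryUse(2)
  (void)y;
}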
+
+ SmallPtrSet<BasicBlock *, 32> LiveInBlocks;
+ SmallVector<BasicBlock *, 64> LiveInBlockWorklist(DefUseBlocks.begin(),
+ DefUseBlocks.end());
+ // Now that we have a set of blocks where a value is live-in, recursively add
+ // predecessors until we find the full region the value is live.
+ while (!LiveInBlockWorklist.empty()) {
+ BasicBlock *BB = LiveInBlockWorklist.pop_back_val();
+
+ // The block really is live in here, insert it into the set. If already in
+ // the set, then it has already been processed.
+ if (!LiveInBlocks.insert(BB).second)
+ continue;
+
+ // Since the value is live into BB, it is either defined in a predecessor or
+ // live into it too.
+ LiveInBlockWorklist.append(pred_begin(BB), pred_end(BB));
+ }
+
+ // Determine where our MemoryPhis should go
+ ForwardIDFCalculator IDFs(*DT);
+ IDFs.setDefiningBlocks(DefiningBlocks);
+ IDFs.setLiveInBlocks(LiveInBlocks);
+ SmallVector<BasicBlock *, 32> IDFBlocks;
+ IDFs.calculate(IDFBlocks);
+
+ // Now place MemoryPhi nodes.
+ for (auto &BB : IDFBlocks) {
+ // Insert phi node
+ AccessList *Accesses = getOrCreateAccessList(BB);
+ MemoryPhi *Phi = new MemoryPhi(BB->getContext(), BB, NextID++);
+ ValueToMemoryAccess.insert(std::make_pair(BB, Phi));
+ // Phis are always placed at the front of the block.
+ Accesses->push_front(Phi);
+ }
+
+ // Now do regular SSA renaming on the MemoryDef/MemoryUse. Visited will get
+ // filled in with all blocks.
+ SmallPtrSet<BasicBlock *, 16> Visited;
+ renamePass(DT->getRootNode(), LiveOnEntryDef.get(), Visited);
+
+ MemorySSAWalker *Walker = getWalker();
+
+ // Now optimize the MemoryUse's defining access to point to the nearest
+ // dominating clobbering def.
+ // This ensures that MemoryUse's that are killed by the same store are
+ // immediate users of that store, one of the invariants we guarantee.
+ for (auto DomNode : depth_first(DT)) {
+ BasicBlock *BB = DomNode->getBlock();
+ auto AI = PerBlockAccesses.find(BB);
+ if (AI == PerBlockAccesses.end())
+ continue;
+ AccessList *Accesses = AI->second.get();
+ for (auto &MA : *Accesses) {
+ if (auto *MU = dyn_cast<MemoryUse>(&MA)) {
+ Instruction *Inst = MU->getMemoryInst();
+ MU->setDefiningAccess(Walker->getClobberingMemoryAccess(Inst));
+ }
+ }
+ }
+
+ // Mark the uses in unreachable blocks as live on entry, so that they go
+ // somewhere.
+ for (auto &BB : F)
+ if (!Visited.count(&BB))
+ markUnreachableAsLiveOnEntry(&BB);
+}
+
+MemorySSAWalker *MemorySSA::getWalker() {
+ if (Walker)
+ return Walker.get();
+
+ Walker = make_unique<CachingWalker>(this, AA, DT);
+ return Walker.get();
+}
+
+MemoryPhi *MemorySSA::createMemoryPhi(BasicBlock *BB) {
+ assert(!getMemoryAccess(BB) && "MemoryPhi already exists for this BB");
+ AccessList *Accesses = getOrCreateAccessList(BB);
+ MemoryPhi *Phi = new MemoryPhi(BB->getContext(), BB, NextID++);
+ ValueToMemoryAccess.insert(std::make_pair(BB, Phi));
+ // Phis are always placed at the front of the block.
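A worked example of where the IDF computation places MemoryPhis: a store on one arm of a diamond forces a merge at the join block (illustrative only; numbering as in the print format later in this file):

// Illustrative only: the join block receives a MemoryPhi.
void diamondPhi(bool Cond, int *P) {
  if (Cond)
    *P = 42; // 1 = MemoryDef(liveOnEntry) in the taken arm
  // join: 2 = MemoryPhi({then, 1}, {entry, liveOnEntry})
  int V = *P; // MemoryUse(2)
  (void)V;
}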
+ Accesses->push_front(Phi);
+ return Phi;
+}
+
+MemoryUseOrDef *MemorySSA::createDefinedAccess(Instruction *I,
+ MemoryAccess *Definition) {
+ assert(!isa<PHINode>(I) && "Cannot create a defined access for a PHI");
+ MemoryUseOrDef *NewAccess = createNewAccess(I);
+ assert(
+ NewAccess != nullptr &&
+ "Tried to create a memory access for a non-memory touching instruction");
+ NewAccess->setDefiningAccess(Definition);
+ return NewAccess;
+}
+
+MemoryAccess *MemorySSA::createMemoryAccessInBB(Instruction *I,
+ MemoryAccess *Definition,
+ const BasicBlock *BB,
+ InsertionPlace Point) {
+ MemoryUseOrDef *NewAccess = createDefinedAccess(I, Definition);
+ auto *Accesses = getOrCreateAccessList(BB);
+ if (Point == Beginning) {
+ // It goes after any phi nodes
+ auto AI = std::find_if(
+ Accesses->begin(), Accesses->end(),
+ [](const MemoryAccess &MA) { return !isa<MemoryPhi>(MA); });
+
+ Accesses->insert(AI, NewAccess);
+ } else {
+ Accesses->push_back(NewAccess);
+ }
+
+ return NewAccess;
+}
+MemoryAccess *MemorySSA::createMemoryAccessBefore(Instruction *I,
+ MemoryAccess *Definition,
+ MemoryAccess *InsertPt) {
+ assert(I->getParent() == InsertPt->getBlock() &&
+ "New and old access must be in the same block");
+ MemoryUseOrDef *NewAccess = createDefinedAccess(I, Definition);
+ auto *Accesses = getOrCreateAccessList(InsertPt->getBlock());
+ Accesses->insert(AccessList::iterator(InsertPt), NewAccess);
+ return NewAccess;
+}
+
+MemoryAccess *MemorySSA::createMemoryAccessAfter(Instruction *I,
+ MemoryAccess *Definition,
+ MemoryAccess *InsertPt) {
+ assert(I->getParent() == InsertPt->getBlock() &&
+ "New and old access must be in the same block");
+ MemoryUseOrDef *NewAccess = createDefinedAccess(I, Definition);
+ auto *Accesses = getOrCreateAccessList(InsertPt->getBlock());
+ Accesses->insertAfter(AccessList::iterator(InsertPt), NewAccess);
+ return NewAccess;
+}
+
+/// \brief Helper function to create new memory accesses
+MemoryUseOrDef *MemorySSA::createNewAccess(Instruction *I) {
+ // The assume intrinsic has a control dependency which we model by claiming
+ // that it writes arbitrarily. Ignore that fake memory dependency here.
+ // FIXME: Replace this special casing with a more accurate modelling of
+ // assume's control dependency.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ return nullptr;
+
+ // Find out what effect this instruction has on memory.
+ ModRefInfo ModRef = AA->getModRefInfo(I);
+ bool Def = bool(ModRef & MRI_Mod);
+ bool Use = bool(ModRef & MRI_Ref);
+
+ // It's possible for an instruction to not modify memory at all. During
+ // construction, we ignore such instructions.
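The decision above reduces to a small table: anything that may write becomes a MemoryDef (even if it also reads), a pure read becomes a MemoryUse, and instructions with no memory effect get no access at all. A minimal mirror of that classification (illustrative only):

// Illustrative only: how ModRef bits map to MemorySSA access kinds.
enum class MSSAKind { None, Use, Def };
MSSAKind classifyForMSSA(bool Mod, bool Ref) {
  if (Mod)
    return MSSAKind::Def; // store, or a call that may write (and read)
  if (Ref)
    return MSSAKind::Use; // load, or a read-only call
  return MSSAKind::None;  // ignored during construction
}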
+ if (!Def && !Use) + return nullptr; + + assert((Def || Use) && + "Trying to create a memory access with a non-memory instruction"); + + MemoryUseOrDef *MUD; + if (Def) + MUD = new MemoryDef(I->getContext(), nullptr, I, I->getParent(), NextID++); + else + MUD = new MemoryUse(I->getContext(), nullptr, I, I->getParent()); + ValueToMemoryAccess.insert(std::make_pair(I, MUD)); + return MUD; +} + +MemoryAccess *MemorySSA::findDominatingDef(BasicBlock *UseBlock, + enum InsertionPlace Where) { + // Handle the initial case + if (Where == Beginning) + // The only thing that could define us at the beginning is a phi node + if (MemoryPhi *Phi = getMemoryAccess(UseBlock)) + return Phi; + + DomTreeNode *CurrNode = DT->getNode(UseBlock); + // Need to be defined by our dominator + if (Where == Beginning) + CurrNode = CurrNode->getIDom(); + Where = End; + while (CurrNode) { + auto It = PerBlockAccesses.find(CurrNode->getBlock()); + if (It != PerBlockAccesses.end()) { + auto &Accesses = It->second; + for (MemoryAccess &RA : reverse(*Accesses)) { + if (isa<MemoryDef>(RA) || isa<MemoryPhi>(RA)) + return &RA; + } + } + CurrNode = CurrNode->getIDom(); + } + return LiveOnEntryDef.get(); +} + +/// \brief Returns true if \p Replacer dominates \p Replacee . +bool MemorySSA::dominatesUse(const MemoryAccess *Replacer, + const MemoryAccess *Replacee) const { + if (isa<MemoryUseOrDef>(Replacee)) + return DT->dominates(Replacer->getBlock(), Replacee->getBlock()); + const auto *MP = cast<MemoryPhi>(Replacee); + // For a phi node, the use occurs in the predecessor block of the phi node. + // Since we may occur multiple times in the phi node, we have to check each + // operand to ensure Replacer dominates each operand where Replacee occurs. + for (const Use &Arg : MP->operands()) { + if (Arg.get() != Replacee && + !DT->dominates(Replacer->getBlock(), MP->getIncomingBlock(Arg))) + return false; + } + return true; +} + +/// \brief If all arguments of a MemoryPHI are defined by the same incoming +/// argument, return that argument. +static MemoryAccess *onlySingleValue(MemoryPhi *MP) { + MemoryAccess *MA = nullptr; + + for (auto &Arg : MP->operands()) { + if (!MA) + MA = cast<MemoryAccess>(Arg); + else if (MA != Arg) + return nullptr; + } + return MA; +} + +/// \brief Properly remove \p MA from all of MemorySSA's lookup tables. +/// +/// Because of the way the intrusive list and use lists work, it is important to +/// do removal in the right order. 
+void MemorySSA::removeFromLookups(MemoryAccess *MA) { + assert(MA->use_empty() && + "Trying to remove memory access that still has uses"); + if (MemoryUseOrDef *MUD = dyn_cast<MemoryUseOrDef>(MA)) + MUD->setDefiningAccess(nullptr); + // Invalidate our walker's cache if necessary + if (!isa<MemoryUse>(MA)) + Walker->invalidateInfo(MA); + // The call below to erase will destroy MA, so we can't change the order we + // are doing things here + Value *MemoryInst; + if (MemoryUseOrDef *MUD = dyn_cast<MemoryUseOrDef>(MA)) { + MemoryInst = MUD->getMemoryInst(); + } else { + MemoryInst = MA->getBlock(); + } + ValueToMemoryAccess.erase(MemoryInst); + + auto AccessIt = PerBlockAccesses.find(MA->getBlock()); + std::unique_ptr<AccessList> &Accesses = AccessIt->second; + Accesses->erase(MA); + if (Accesses->empty()) + PerBlockAccesses.erase(AccessIt); +} + +void MemorySSA::removeMemoryAccess(MemoryAccess *MA) { + assert(!isLiveOnEntryDef(MA) && "Trying to remove the live on entry def"); + // We can only delete phi nodes if they have no uses, or we can replace all + // uses with a single definition. + MemoryAccess *NewDefTarget = nullptr; + if (MemoryPhi *MP = dyn_cast<MemoryPhi>(MA)) { + // Note that it is sufficient to know that all edges of the phi node have + // the same argument. If they do, by the definition of dominance frontiers + // (which we used to place this phi), that argument must dominate this phi, + // and thus, must dominate the phi's uses, and so we will not hit the assert + // below. + NewDefTarget = onlySingleValue(MP); + assert((NewDefTarget || MP->use_empty()) && + "We can't delete this memory phi"); + } else { + NewDefTarget = cast<MemoryUseOrDef>(MA)->getDefiningAccess(); + } + + // Re-point the uses at our defining access + if (!MA->use_empty()) + MA->replaceAllUsesWith(NewDefTarget); + + // The call below to erase will destroy MA, so we can't change the order we + // are doing things here + removeFromLookups(MA); +} + +void MemorySSA::print(raw_ostream &OS) const { + MemorySSAAnnotatedWriter Writer(this); + F.print(OS, &Writer); +} + +void MemorySSA::dump() const { + MemorySSAAnnotatedWriter Writer(this); + F.print(dbgs(), &Writer); +} + +void MemorySSA::verifyMemorySSA() const { + verifyDefUses(F); + verifyDomination(F); + verifyOrdering(F); +} + +/// \brief Verify that the order and existence of MemoryAccesses matches the +/// order and existence of memory affecting instructions. +void MemorySSA::verifyOrdering(Function &F) const { + // Walk all the blocks, comparing what the lookups think and what the access + // lists think, as well as the order in the blocks vs the order in the access + // lists. 
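A hedged sketch of the intended client usage: a transform that deletes a memory instruction drops the corresponding access first, letting removeMemoryAccess re-point the use chain at the defining access (the dead-instruction scenario is assumed):

// Illustrative only: keep MemorySSA consistent while erasing.
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/MemorySSA.h"
using namespace llvm;

void eraseWithMemorySSA(MemorySSA &MSSA, Instruction *I) {
  if (MemoryAccess *MA = MSSA.getMemoryAccess(I))
    MSSA.removeMemoryAccess(MA); // uses of MA now use MA's defining access
  I->eraseFromParent();
}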
+ SmallVector<MemoryAccess *, 32> ActualAccesses;
+ for (BasicBlock &B : F) {
+ const AccessList *AL = getBlockAccesses(&B);
+ MemoryAccess *Phi = getMemoryAccess(&B);
+ if (Phi)
+ ActualAccesses.push_back(Phi);
+ for (Instruction &I : B) {
+ MemoryAccess *MA = getMemoryAccess(&I);
+ assert((!MA || AL) && "We have memory affecting instructions "
+ "in this block but they are not in the "
+ "access list");
+ if (MA)
+ ActualAccesses.push_back(MA);
+ }
+ // Either we hit the assert, really have no accesses, or we have both
+ // accesses and an access list
+ if (!AL)
+ continue;
+ assert(AL->size() == ActualAccesses.size() &&
+ "We don't have the same number of accesses in the block as on the "
+ "access list");
+ auto ALI = AL->begin();
+ auto AAI = ActualAccesses.begin();
+ while (ALI != AL->end() && AAI != ActualAccesses.end()) {
+ assert(&*ALI == *AAI && "Not the same accesses in the same order");
+ ++ALI;
+ ++AAI;
+ }
+ ActualAccesses.clear();
+ }
+}
+
+/// \brief Verify the domination properties of MemorySSA by checking that each
+/// definition dominates all of its uses.
+void MemorySSA::verifyDomination(Function &F) const {
+ for (BasicBlock &B : F) {
+ // Phi nodes are attached to basic blocks
+ if (MemoryPhi *MP = getMemoryAccess(&B)) {
+ for (User *U : MP->users()) {
+ BasicBlock *UseBlock;
+ // Phi operands are used on edges, we simulate the right domination by
+ // acting as if the use occurred at the end of the predecessor block.
+ if (MemoryPhi *P = dyn_cast<MemoryPhi>(U)) {
+ for (const auto &Arg : P->operands()) {
+ if (Arg == MP) {
+ UseBlock = P->getIncomingBlock(Arg);
+ break;
+ }
+ }
+ } else {
+ UseBlock = cast<MemoryAccess>(U)->getBlock();
+ }
+ (void)UseBlock;
+ assert(DT->dominates(MP->getBlock(), UseBlock) &&
+ "Memory PHI does not dominate its uses");
+ }
+ }
+
+ for (Instruction &I : B) {
+ MemoryAccess *MD = dyn_cast_or_null<MemoryDef>(getMemoryAccess(&I));
+ if (!MD)
+ continue;
+
+ for (User *U : MD->users()) {
+ BasicBlock *UseBlock;
+ (void)UseBlock;
+ // Things are allowed to flow to phi nodes over their predecessor edge.
+ if (auto *P = dyn_cast<MemoryPhi>(U)) {
+ for (const auto &Arg : P->operands()) {
+ if (Arg == MD) {
+ UseBlock = P->getIncomingBlock(Arg);
+ break;
+ }
+ }
+ } else {
+ UseBlock = cast<MemoryAccess>(U)->getBlock();
+ }
+ assert(DT->dominates(MD->getBlock(), UseBlock) &&
+ "Memory Def does not dominate its uses");
+ }
+ }
+ }
+}
+
+/// \brief Verify the def-use lists in MemorySSA, by verifying that \p Use
+/// appears in the use list of \p Def.
+///
+/// llvm_unreachable is used instead of asserts because this may be called in
+/// a build without asserts. In that case, we don't want this to turn into a
+/// nop.
+void MemorySSA::verifyUseInDefs(MemoryAccess *Def, MemoryAccess *Use) const {
+ // The live on entry use may cause us to get a NULL def here
+ if (!Def) {
+ if (!isLiveOnEntryDef(Use))
+ llvm_unreachable("Null def but use does not point to live on entry def");
+ } else if (std::find(Def->user_begin(), Def->user_end(), Use) ==
+ Def->user_end()) {
+ llvm_unreachable("Did not find use in def's use list");
+ }
+}
+
+/// \brief Verify the immediate use information, by walking all the memory
+/// accesses and verifying that, for each use, it appears in the
+/// appropriate def's use list
+void MemorySSA::verifyDefUses(Function &F) const {
+ for (BasicBlock &B : F) {
+ // Phi nodes are attached to basic blocks
+ if (MemoryPhi *Phi = getMemoryAccess(&B)) {
+ assert(Phi->getNumOperands() == static_cast<unsigned>(std::distance(
+ pred_begin(&B), pred_end(&B))) &&
+ "Incomplete MemoryPhi Node");
+ for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I)
+ verifyUseInDefs(Phi->getIncomingValue(I), Phi);
+ }
+
+ for (Instruction &I : B) {
+ if (MemoryAccess *MA = getMemoryAccess(&I)) {
+ assert(isa<MemoryUseOrDef>(MA) &&
+ "Found a phi node not attached to a bb");
+ verifyUseInDefs(cast<MemoryUseOrDef>(MA)->getDefiningAccess(), MA);
+ }
+ }
+ }
+}
+
+MemoryAccess *MemorySSA::getMemoryAccess(const Value *I) const {
+ return ValueToMemoryAccess.lookup(I);
+}
+
+MemoryPhi *MemorySSA::getMemoryAccess(const BasicBlock *BB) const {
+ return cast_or_null<MemoryPhi>(getMemoryAccess((const Value *)BB));
+}
+
+/// \brief Determine, for two memory accesses in the same block,
+/// whether \p Dominator dominates \p Dominatee.
+/// \returns True if \p Dominator dominates \p Dominatee.
+bool MemorySSA::locallyDominates(const MemoryAccess *Dominator,
+ const MemoryAccess *Dominatee) const {
+
+ assert((Dominator->getBlock() == Dominatee->getBlock()) &&
+ "Asking for local domination when accesses are in different blocks!");
+
+ // A node dominates itself.
+ if (Dominatee == Dominator)
+ return true;
+
+ // When Dominatee is defined on function entry, it is not dominated by another
+ // memory access.
+ if (isLiveOnEntryDef(Dominatee))
+ return false;
+
+ // When Dominator is defined on function entry, it dominates the other memory
+ // access.
+ if (isLiveOnEntryDef(Dominator)) + return true; + + // Get the access list for the block + const AccessList *AccessList = getBlockAccesses(Dominator->getBlock()); + AccessList::const_reverse_iterator It(Dominator->getIterator()); + + // If we hit the beginning of the access list before we hit dominatee, we must + // dominate it + return std::none_of(It, AccessList->rend(), + [&](const MemoryAccess &MA) { return &MA == Dominatee; }); +} + +const static char LiveOnEntryStr[] = "liveOnEntry"; + +void MemoryDef::print(raw_ostream &OS) const { + MemoryAccess *UO = getDefiningAccess(); + + OS << getID() << " = MemoryDef("; + if (UO && UO->getID()) + OS << UO->getID(); + else + OS << LiveOnEntryStr; + OS << ')'; +} + +void MemoryPhi::print(raw_ostream &OS) const { + bool First = true; + OS << getID() << " = MemoryPhi("; + for (const auto &Op : operands()) { + BasicBlock *BB = getIncomingBlock(Op); + MemoryAccess *MA = cast<MemoryAccess>(Op); + if (!First) + OS << ','; + else + First = false; + + OS << '{'; + if (BB->hasName()) + OS << BB->getName(); + else + BB->printAsOperand(OS, false); + OS << ','; + if (unsigned ID = MA->getID()) + OS << ID; + else + OS << LiveOnEntryStr; + OS << '}'; + } + OS << ')'; +} + +MemoryAccess::~MemoryAccess() {} + +void MemoryUse::print(raw_ostream &OS) const { + MemoryAccess *UO = getDefiningAccess(); + OS << "MemoryUse("; + if (UO && UO->getID()) + OS << UO->getID(); + else + OS << LiveOnEntryStr; + OS << ')'; +} + +void MemoryAccess::dump() const { + print(dbgs()); + dbgs() << "\n"; +} + +char MemorySSAPrinterLegacyPass::ID = 0; + +MemorySSAPrinterLegacyPass::MemorySSAPrinterLegacyPass() : FunctionPass(ID) { + initializeMemorySSAPrinterLegacyPassPass(*PassRegistry::getPassRegistry()); +} + +void MemorySSAPrinterLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<MemorySSAWrapperPass>(); + AU.addPreserved<MemorySSAWrapperPass>(); +} + +bool MemorySSAPrinterLegacyPass::runOnFunction(Function &F) { + auto &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA(); + MSSA.print(dbgs()); + if (VerifyMemorySSA) + MSSA.verifyMemorySSA(); + return false; +} + +char MemorySSAAnalysis::PassID; + +MemorySSA MemorySSAAnalysis::run(Function &F, AnalysisManager<Function> &AM) { + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + auto &AA = AM.getResult<AAManager>(F); + return MemorySSA(F, &AA, &DT); +} + +PreservedAnalyses MemorySSAPrinterPass::run(Function &F, + FunctionAnalysisManager &AM) { + OS << "MemorySSA for function: " << F.getName() << "\n"; + AM.getResult<MemorySSAAnalysis>(F).print(OS); + + return PreservedAnalyses::all(); +} + +PreservedAnalyses MemorySSAVerifierPass::run(Function &F, + FunctionAnalysisManager &AM) { + AM.getResult<MemorySSAAnalysis>(F).verifyMemorySSA(); + + return PreservedAnalyses::all(); +} + +char MemorySSAWrapperPass::ID = 0; + +MemorySSAWrapperPass::MemorySSAWrapperPass() : FunctionPass(ID) { + initializeMemorySSAWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +void MemorySSAWrapperPass::releaseMemory() { MSSA.reset(); } + +void MemorySSAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequiredTransitive<DominatorTreeWrapperPass>(); + AU.addRequiredTransitive<AAResultsWrapperPass>(); +} + +bool MemorySSAWrapperPass::runOnFunction(Function &F) { + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + MSSA.reset(new MemorySSA(F, &AA, &DT)); + return false; +} + +void 
MemorySSAWrapperPass::verifyAnalysis() const { MSSA->verifyMemorySSA(); }
+
+void MemorySSAWrapperPass::print(raw_ostream &OS, const Module *M) const {
+  MSSA->print(OS);
+}
+
+MemorySSAWalker::MemorySSAWalker(MemorySSA *M) : MSSA(M) {}
+
+MemorySSA::CachingWalker::CachingWalker(MemorySSA *M, AliasAnalysis *A,
+                                        DominatorTree *D)
+    : MemorySSAWalker(M), AA(A), DT(D) {}
+
+MemorySSA::CachingWalker::~CachingWalker() {}
+
+struct MemorySSA::CachingWalker::UpwardsMemoryQuery {
+  // True if we saw a phi whose predecessor was a backedge
+  bool SawBackedgePhi;
+  // True if our original query started off as a call
+  bool IsCall;
+  // The pointer location we started the query with. This will be empty if
+  // IsCall is true.
+  MemoryLocation StartingLoc;
+  // This is the instruction we were querying about.
+  const Instruction *Inst;
+  // Set of visited Instructions for this query.
+  DenseSet<MemoryAccessPair> Visited;
+  // Vector of visited call accesses for this query. This is separated out
+  // because you can always cache and look up the result of call queries
+  // (i.e., when IsCall == true) for every call in the chain. The calls have
+  // no AA location associated with them, and thus no context dependence.
+  SmallVector<const MemoryAccess *, 32> VisitedCalls;
+  // The MemoryAccess we actually got called with, used to test local
+  // domination.
+  const MemoryAccess *OriginalAccess;
+
+  UpwardsMemoryQuery()
+      : SawBackedgePhi(false), IsCall(false), Inst(nullptr),
+        OriginalAccess(nullptr) {}
+
+  UpwardsMemoryQuery(const Instruction *Inst, const MemoryAccess *Access)
+      : SawBackedgePhi(false), IsCall(ImmutableCallSite(Inst)), Inst(Inst),
+        OriginalAccess(Access) {}
+};
+
+void MemorySSA::CachingWalker::invalidateInfo(MemoryAccess *MA) {
+
+  // TODO: We can do much better cache invalidation with differently stored
+  // caches. For now, for MemoryUses, we simply remove them from the cache,
+  // and kill the entire call/non-call cache for everything else. The problem
+  // is that for phis or defs, we'd need to follow the use chains down and
+  // invalidate anything below us in the chain that currently terminates at
+  // this access.
+
+  // See if this is a MemoryUse; if so, just remove the cached info. A
+  // MemoryUse is by definition never a barrier, so nothing in the cache could
+  // point to this use. In that case, we only need to invalidate the info for
+  // the use itself.
+
+  if (MemoryUse *MU = dyn_cast<MemoryUse>(MA)) {
+    UpwardsMemoryQuery Q;
+    Instruction *I = MU->getMemoryInst();
+    Q.IsCall = bool(ImmutableCallSite(I));
+    Q.Inst = I;
+    if (!Q.IsCall)
+      Q.StartingLoc = MemoryLocation::get(I);
+    doCacheRemove(MA, Q, Q.StartingLoc);
+  } else {
+    // If it is not a use, the best we can do right now is destroy the cache.
+    CachedUpwardsClobberingCall.clear();
+    CachedUpwardsClobberingAccess.clear();
+  }
+
+#ifdef EXPENSIVE_CHECKS
+  // Run this only when expensive checks are enabled.
+  verifyRemoved(MA);
+#endif
+}
+
+void MemorySSA::CachingWalker::doCacheRemove(const MemoryAccess *M,
+                                             const UpwardsMemoryQuery &Q,
+                                             const MemoryLocation &Loc) {
+  if (Q.IsCall)
+    CachedUpwardsClobberingCall.erase(M);
+  else
+    CachedUpwardsClobberingAccess.erase({M, Loc});
+}
+
+void MemorySSA::CachingWalker::doCacheInsert(const MemoryAccess *M,
+                                             MemoryAccess *Result,
+                                             const UpwardsMemoryQuery &Q,
+                                             const MemoryLocation &Loc) {
+  // This is fine for Phis, since there are times where we can't optimize them.
+  // Making a def its own clobber is never correct, though.
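+  // (Sketch of why: doCacheLookup is consulted before walking past an access,
+  // so a cache entry mapping a def to itself would make the walk stop at the
+  // def and report it as its own clobbering access.)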
+ assert((Result != M || isa<MemoryPhi>(M)) && + "Something can't clobber itself!"); + ++NumClobberCacheInserts; + if (Q.IsCall) + CachedUpwardsClobberingCall[M] = Result; + else + CachedUpwardsClobberingAccess[{M, Loc}] = Result; +} + +MemoryAccess * +MemorySSA::CachingWalker::doCacheLookup(const MemoryAccess *M, + const UpwardsMemoryQuery &Q, + const MemoryLocation &Loc) { + ++NumClobberCacheLookups; + MemoryAccess *Result; + + if (Q.IsCall) + Result = CachedUpwardsClobberingCall.lookup(M); + else + Result = CachedUpwardsClobberingAccess.lookup({M, Loc}); + + if (Result) + ++NumClobberCacheHits; + return Result; +} + +bool MemorySSA::CachingWalker::instructionClobbersQuery( + const MemoryDef *MD, UpwardsMemoryQuery &Q, + const MemoryLocation &Loc) const { + Instruction *DefMemoryInst = MD->getMemoryInst(); + assert(DefMemoryInst && "Defining instruction not actually an instruction"); + + if (!Q.IsCall) + return AA->getModRefInfo(DefMemoryInst, Loc) & MRI_Mod; + + // If this is a call, mark it for caching + if (ImmutableCallSite(DefMemoryInst)) + Q.VisitedCalls.push_back(MD); + ModRefInfo I = AA->getModRefInfo(DefMemoryInst, ImmutableCallSite(Q.Inst)); + return I != MRI_NoModRef; +} + +MemoryAccessPair MemorySSA::CachingWalker::UpwardsDFSWalk( + MemoryAccess *StartingAccess, const MemoryLocation &Loc, + UpwardsMemoryQuery &Q, bool FollowingBackedge) { + MemoryAccess *ModifyingAccess = nullptr; + + auto DFI = df_begin(StartingAccess); + for (auto DFE = df_end(StartingAccess); DFI != DFE;) { + MemoryAccess *CurrAccess = *DFI; + if (MSSA->isLiveOnEntryDef(CurrAccess)) + return {CurrAccess, Loc}; + // If this is a MemoryDef, check whether it clobbers our current query. This + // needs to be done before consulting the cache, because the cache reports + // the clobber for CurrAccess. If CurrAccess is a clobber for this query, + // and we ask the cache for information first, then we might skip this + // clobber, which is bad. + if (auto *MD = dyn_cast<MemoryDef>(CurrAccess)) { + // If we hit the top, stop following this path. + // While we can do lookups, we can't sanely do inserts here unless we were + // to track everything we saw along the way, since we don't know where we + // will stop. + if (instructionClobbersQuery(MD, Q, Loc)) { + ModifyingAccess = CurrAccess; + break; + } + } + if (auto CacheResult = doCacheLookup(CurrAccess, Q, Loc)) + return {CacheResult, Loc}; + + // We need to know whether it is a phi so we can track backedges. + // Otherwise, walk all upward defs. + if (!isa<MemoryPhi>(CurrAccess)) { + ++DFI; + continue; + } + +#ifndef NDEBUG + // The loop below visits the phi's children for us. Because phis are the + // only things with multiple edges, skipping the children should always lead + // us to the end of the loop. + // + // Use a copy of DFI because skipChildren would kill our search stack, which + // would make caching anything on the way back impossible. + auto DFICopy = DFI; + assert(DFICopy.skipChildren() == DFE && + "Skipping phi's children doesn't end the DFS?"); +#endif + + const MemoryAccessPair PHIPair(CurrAccess, Loc); + + // Don't try to optimize this phi again if we've already tried to do so. + if (!Q.Visited.insert(PHIPair).second) { + ModifyingAccess = CurrAccess; + break; + } + + std::size_t InitialVisitedCallSize = Q.VisitedCalls.size(); + + // Recurse on PHI nodes, since we need to change locations. + // TODO: Allow graphtraits on pairs, which would turn this whole function + // into a normal single depth first walk. 
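+    // In outline: walk each (incoming access, translated location) pair of
+    // the phi below; if every argument reaches one common def, the phi can be
+    // bypassed in favor of that def, otherwise the phi itself is the clobber.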
+    MemoryAccess *FirstDef = nullptr;
+    for (auto MPI = upward_defs_begin(PHIPair), MPE = upward_defs_end();
+         MPI != MPE; ++MPI) {
+      bool Backedge =
+          !FollowingBackedge &&
+          DT->dominates(CurrAccess->getBlock(), MPI.getPhiArgBlock());
+
+      MemoryAccessPair CurrentPair =
+          UpwardsDFSWalk(MPI->first, MPI->second, Q, Backedge);
+      // All the phi arguments should reach the same point if we can bypass
+      // this phi. The alternative is that they hit this phi node, which
+      // means we can skip this argument.
+      if (FirstDef && CurrentPair.first != PHIPair.first &&
+          CurrentPair.first != FirstDef) {
+        ModifyingAccess = CurrAccess;
+        break;
+      }
+
+      if (!FirstDef)
+        FirstDef = CurrentPair.first;
+    }
+
+    // If the phi-argument loop ran to completion, every argument reached the
+    // same def, so we can bypass the phi in favor of FirstDef. If we exited
+    // it early, the phi itself is the clobbering access.
+    if (!ModifyingAccess) {
+      assert(FirstDef && "Found a Phi with no upward defs?");
+      ModifyingAccess = FirstDef;
+    } else {
+      // If we can't optimize this Phi, then we can't safely cache any of the
+      // calls we visited when trying to optimize it. Wipe them out now.
+      Q.VisitedCalls.resize(InitialVisitedCallSize);
+    }
+    break;
+  }
+
+  if (!ModifyingAccess)
+    return {MSSA->getLiveOnEntryDef(), Q.StartingLoc};
+
+  const BasicBlock *OriginalBlock = StartingAccess->getBlock();
+  assert(DFI.getPathLength() > 0 && "We dropped our path?");
+  unsigned N = DFI.getPathLength();
+  // If we found a clobbering def, the last element in the path will be our
+  // clobber, so we don't want to cache that to itself. OTOH, if we optimized a
+  // phi, we can add the last thing in the path to the cache, since that won't
+  // be the result.
+  if (DFI.getPath(N - 1) == ModifyingAccess)
+    --N;
+  for (; N > 1; --N) {
+    MemoryAccess *CacheAccess = DFI.getPath(N - 1);
+    BasicBlock *CurrBlock = CacheAccess->getBlock();
+    if (!FollowingBackedge)
+      doCacheInsert(CacheAccess, ModifyingAccess, Q, Loc);
+    if (DT->dominates(CurrBlock, OriginalBlock) &&
+        (CurrBlock != OriginalBlock || !FollowingBackedge ||
+         MSSA->locallyDominates(CacheAccess, StartingAccess)))
+      break;
+  }
+
+  // Cache everything else on the way back. The caller should cache
+  // StartingAccess for us.
+  for (; N > 1; --N) {
+    MemoryAccess *CacheAccess = DFI.getPath(N - 1);
+    doCacheInsert(CacheAccess, ModifyingAccess, Q, Loc);
+  }
+
+  return {ModifyingAccess, Loc};
+}
+
+/// \brief Walk the use-def chains starting at \p StartingAccess and find
+/// the MemoryAccess that actually clobbers Loc.
+///
+/// \returns our clobbering memory access
+MemoryAccess *MemorySSA::CachingWalker::getClobberingMemoryAccess(
+    MemoryAccess *StartingAccess, UpwardsMemoryQuery &Q) {
+  return UpwardsDFSWalk(StartingAccess, Q.StartingLoc, Q, false).first;
+}
+
+MemoryAccess *MemorySSA::CachingWalker::getClobberingMemoryAccess(
+    MemoryAccess *StartingAccess, MemoryLocation &Loc) {
+  if (isa<MemoryPhi>(StartingAccess))
+    return StartingAccess;
+
+  auto *StartingUseOrDef = cast<MemoryUseOrDef>(StartingAccess);
+  if (MSSA->isLiveOnEntryDef(StartingUseOrDef))
+    return StartingUseOrDef;
+
+  Instruction *I = StartingUseOrDef->getMemoryInst();
+
+  // Conservatively, fences are always clobbers, so don't perform the walk if
+  // we hit a fence.
+  if (!ImmutableCallSite(I) && I->isFenceLike())
+    return StartingUseOrDef;
+
+  UpwardsMemoryQuery Q;
+  Q.OriginalAccess = StartingUseOrDef;
+  Q.StartingLoc = Loc;
+  Q.Inst = StartingUseOrDef->getMemoryInst();
+  Q.IsCall = false;
+
+  if (auto CacheResult = doCacheLookup(StartingUseOrDef, Q, Q.StartingLoc))
+    return CacheResult;
+
+  // Unlike the other function, do not walk to the def of a def, because we are
+  // handed something we already believe is the clobbering access.
+  MemoryAccess *DefiningAccess = isa<MemoryUse>(StartingUseOrDef)
+                                     ? StartingUseOrDef->getDefiningAccess()
+                                     : StartingUseOrDef;
+
+  MemoryAccess *Clobber = getClobberingMemoryAccess(DefiningAccess, Q);
+  // Only cache this if it wouldn't make Clobber point to itself.
+  if (Clobber != StartingAccess)
+    doCacheInsert(Q.OriginalAccess, Clobber, Q, Q.StartingLoc);
+  DEBUG(dbgs() << "Starting Memory SSA clobber for " << *I << " is ");
+  DEBUG(dbgs() << *StartingUseOrDef << "\n");
+  DEBUG(dbgs() << "Final Memory SSA clobber for " << *I << " is ");
+  DEBUG(dbgs() << *Clobber << "\n");
+  return Clobber;
+}
+
+MemoryAccess *
+MemorySSA::CachingWalker::getClobberingMemoryAccess(const Instruction *I) {
+  // There should be no way to look up an instruction and get a phi as the
+  // access, since we only map BBs to PHIs. So, this must be a use or def.
+  auto *StartingAccess = cast<MemoryUseOrDef>(MSSA->getMemoryAccess(I));
+
+  bool IsCall = bool(ImmutableCallSite(I));
+
+  // We can't sanely do anything with fences; they conservatively clobber all
+  // memory and have no locations to get pointers from to try to disambiguate.
+  if (!IsCall && I->isFenceLike())
+    return StartingAccess;
+
+  UpwardsMemoryQuery Q;
+  Q.OriginalAccess = StartingAccess;
+  Q.IsCall = IsCall;
+  if (!Q.IsCall)
+    Q.StartingLoc = MemoryLocation::get(I);
+  Q.Inst = I;
+  if (auto CacheResult = doCacheLookup(StartingAccess, Q, Q.StartingLoc))
+    return CacheResult;
+
+  // Start with the thing we already think clobbers this location.
+  MemoryAccess *DefiningAccess = StartingAccess->getDefiningAccess();
+
+  // At this point, DefiningAccess may be the live on entry def.
+  // If it is, we will not get a better result.
+  if (MSSA->isLiveOnEntryDef(DefiningAccess))
+    return DefiningAccess;
+
+  MemoryAccess *Result = getClobberingMemoryAccess(DefiningAccess, Q);
+  // DFS won't cache a result for DefiningAccess. So, if DefiningAccess isn't
+  // our clobber, be sure that it gets a cache entry, too.
+  if (Result != DefiningAccess)
+    doCacheInsert(DefiningAccess, Result, Q, Q.StartingLoc);
+  doCacheInsert(Q.OriginalAccess, Result, Q, Q.StartingLoc);
+  // TODO: When this implementation is more mature, we may want to figure out
+  // what this additional caching buys us. It's most likely A Good Thing.
+  if (Q.IsCall)
+    for (const MemoryAccess *MA : Q.VisitedCalls)
+      if (MA != Result)
+        doCacheInsert(MA, Result, Q, Q.StartingLoc);
+
+  DEBUG(dbgs() << "Starting Memory SSA clobber for " << *I << " is ");
+  DEBUG(dbgs() << *DefiningAccess << "\n");
+  DEBUG(dbgs() << "Final Memory SSA clobber for " << *I << " is ");
+  DEBUG(dbgs() << *Result << "\n");
+
+  return Result;
+}
+
+// Verify that MA doesn't exist in any of the caches.
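+// This is the EXPENSIVE_CHECKS counterpart to invalidateInfo above: after an
+// access has been invalidated, no cache key or cached result may still
+// mention it.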
+void MemorySSA::CachingWalker::verifyRemoved(MemoryAccess *MA) { +#ifndef NDEBUG + for (auto &P : CachedUpwardsClobberingAccess) + assert(P.first.first != MA && P.second != MA && + "Found removed MemoryAccess in cache."); + for (auto &P : CachedUpwardsClobberingCall) + assert(P.first != MA && P.second != MA && + "Found removed MemoryAccess in cache."); +#endif // !NDEBUG +} + +MemoryAccess * +DoNothingMemorySSAWalker::getClobberingMemoryAccess(const Instruction *I) { + MemoryAccess *MA = MSSA->getMemoryAccess(I); + if (auto *Use = dyn_cast<MemoryUseOrDef>(MA)) + return Use->getDefiningAccess(); + return MA; +} + +MemoryAccess *DoNothingMemorySSAWalker::getClobberingMemoryAccess( + MemoryAccess *StartingAccess, MemoryLocation &) { + if (auto *Use = dyn_cast<MemoryUseOrDef>(StartingAccess)) + return Use->getDefiningAccess(); + return StartingAccess; +} +} diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp index 9ec28a3f3d47c..eb91885186244 100644 --- a/lib/Transforms/Utils/ModuleUtils.cpp +++ b/lib/Transforms/Utils/ModuleUtils.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/ModuleUtils.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -21,8 +20,8 @@ using namespace llvm; -static void appendToGlobalArray(const char *Array, - Module &M, Function *F, int Priority) { +static void appendToGlobalArray(const char *Array, Module &M, Function *F, + int Priority, Constant *Data) { IRBuilder<> IRB(M.getContext()); FunctionType *FnTy = FunctionType::get(IRB.getVoidTy(), false); @@ -31,15 +30,26 @@ static void appendToGlobalArray(const char *Array, SmallVector<Constant *, 16> CurrentCtors; StructType *EltTy; if (GlobalVariable *GVCtor = M.getNamedGlobal(Array)) { - // If there is a global_ctors array, use the existing struct type, which can - // have 2 or 3 fields. - ArrayType *ATy = cast<ArrayType>(GVCtor->getType()->getElementType()); - EltTy = cast<StructType>(ATy->getElementType()); + ArrayType *ATy = cast<ArrayType>(GVCtor->getValueType()); + StructType *OldEltTy = cast<StructType>(ATy->getElementType()); + // Upgrade a 2-field global array type to the new 3-field format if needed. + if (Data && OldEltTy->getNumElements() < 3) + EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy), + IRB.getInt8PtrTy(), nullptr); + else + EltTy = OldEltTy; if (Constant *Init = GVCtor->getInitializer()) { unsigned n = Init->getNumOperands(); CurrentCtors.reserve(n + 1); - for (unsigned i = 0; i != n; ++i) - CurrentCtors.push_back(cast<Constant>(Init->getOperand(i))); + for (unsigned i = 0; i != n; ++i) { + auto Ctor = cast<Constant>(Init->getOperand(i)); + if (EltTy != OldEltTy) + Ctor = ConstantStruct::get( + EltTy, Ctor->getAggregateElement((unsigned)0), + Ctor->getAggregateElement(1), + Constant::getNullValue(IRB.getInt8PtrTy()), nullptr); + CurrentCtors.push_back(Ctor); + } } GVCtor->eraseFromParent(); } else { @@ -54,7 +64,8 @@ static void appendToGlobalArray(const char *Array, CSVals[1] = F; // FIXME: Drop support for the two element form in LLVM 4.0. if (EltTy->getNumElements() >= 3) - CSVals[2] = llvm::Constant::getNullValue(IRB.getInt8PtrTy()); + CSVals[2] = Data ? 
ConstantExpr::getPointerCast(Data, IRB.getInt8PtrTy())
+                       : Constant::getNullValue(IRB.getInt8PtrTy());
 Constant *RuntimeCtorInit =
     ConstantStruct::get(EltTy, makeArrayRef(CSVals, EltTy->getNumElements()));
@@ -70,29 +81,12 @@ static void appendToGlobalArray(const char *Array,
                                 GlobalValue::AppendingLinkage, NewInit, Array);
 }
 
-void llvm::appendToGlobalCtors(Module &M, Function *F, int Priority) {
-  appendToGlobalArray("llvm.global_ctors", M, F, Priority);
+void llvm::appendToGlobalCtors(Module &M, Function *F, int Priority, Constant *Data) {
+  appendToGlobalArray("llvm.global_ctors", M, F, Priority, Data);
 }
 
-void llvm::appendToGlobalDtors(Module &M, Function *F, int Priority) {
-  appendToGlobalArray("llvm.global_dtors", M, F, Priority);
-}
-
-GlobalVariable *
-llvm::collectUsedGlobalVariables(Module &M, SmallPtrSetImpl<GlobalValue *> &Set,
-                                 bool CompilerUsed) {
-  const char *Name = CompilerUsed ? "llvm.compiler.used" : "llvm.used";
-  GlobalVariable *GV = M.getGlobalVariable(Name);
-  if (!GV || !GV->hasInitializer())
-    return GV;
-
-  const ConstantArray *Init = cast<ConstantArray>(GV->getInitializer());
-  for (unsigned I = 0, E = Init->getNumOperands(); I != E; ++I) {
-    Value *Op = Init->getOperand(I);
-    GlobalValue *G = cast<GlobalValue>(Op->stripPointerCastsNoFollowAliases());
-    Set.insert(G);
-  }
-  return GV;
+void llvm::appendToGlobalDtors(Module &M, Function *F, int Priority, Constant *Data) {
+  appendToGlobalArray("llvm.global_dtors", M, F, Priority, Data);
 }
 
 Function *llvm::checkSanitizerInterfaceFunction(Constant *FuncOrBitcast) {
@@ -132,4 +126,3 @@ std::pair<Function *, Function *> llvm::createSanitizerCtorAndInitFunctions(
   }
   return std::make_pair(Ctor, InitFunction);
 }
-
diff --git a/lib/Transforms/Utils/NameAnonFunctions.cpp b/lib/Transforms/Utils/NameAnonFunctions.cpp
new file mode 100644
index 0000000000000..c4f3839d8482a
--- /dev/null
+++ b/lib/Transforms/Utils/NameAnonFunctions.cpp
@@ -0,0 +1,102 @@
+//===- NameAnonFunctions.cpp - Name anonymous functions ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements naming anonymous functions to make sure they can be
+// referred to by ThinLTO.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+// Compute a "unique" hash for the module based on the names of its public
+// functions and globals.
+class ModuleHasher {
+  Module &TheModule;
+  std::string TheHash;
+
+public:
+  ModuleHasher(Module &M) : TheModule(M) {}
+
+  /// Return the lazily computed hash.
+  std::string &get() {
+    if (!TheHash.empty())
+      // Cache hit :)
+      return TheHash;
+
+    MD5 Hasher;
+    for (auto &F : TheModule) {
+      if (F.isDeclaration() || F.hasLocalLinkage() || !F.hasName())
+        continue;
+      auto Name = F.getName();
+      Hasher.update(Name);
+    }
+    for (auto &GV : TheModule.globals()) {
+      if (GV.isDeclaration() || GV.hasLocalLinkage() || !GV.hasName())
+        continue;
+      auto Name = GV.getName();
+      Hasher.update(Name);
+    }
+
+    // Now return the result.
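+    // Note that the hash depends only on the names hashed above, so it is
+    // stable as long as the module's public symbol names do not change.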
+ MD5::MD5Result Hash; + Hasher.final(Hash); + SmallString<32> Result; + MD5::stringifyResult(Hash, Result); + TheHash = Result.str(); + return TheHash; + } +}; + +// Rename all the anon functions in the module +bool llvm::nameUnamedFunctions(Module &M) { + bool Changed = false; + ModuleHasher ModuleHash(M); + int count = 0; + for (auto &F : M) { + if (F.hasName()) + continue; + F.setName(Twine("anon.") + ModuleHash.get() + "." + Twine(count++)); + Changed = true; + } + return Changed; +} + +namespace { + +// Simple pass that provides a name to every anon function. +class NameAnonFunction : public ModulePass { + +public: + /// Pass identification, replacement for typeid + static char ID; + + /// Specify pass name for debug output + const char *getPassName() const override { return "Name Anon Functions"; } + + explicit NameAnonFunction() : ModulePass(ID) {} + + bool runOnModule(Module &M) override { return nameUnamedFunctions(M); } +}; +char NameAnonFunction::ID = 0; + +} // anonymous namespace + +INITIALIZE_PASS_BEGIN(NameAnonFunction, "name-anon-functions", + "Provide a name to nameless functions", false, false) +INITIALIZE_PASS_END(NameAnonFunction, "name-anon-functions", + "Provide a name to nameless functions", false, false) + +namespace llvm { +ModulePass *createNameAnonFunctionPass() { return new NameAnonFunction(); } +} diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index c4f9b9f614078..cbf385d563399 100644 --- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -523,7 +523,7 @@ void PromoteMem2Reg::run() { AllocaInfo Info; LargeBlockInfo LBI; - IDFCalculator IDF(DT); + ForwardIDFCalculator IDF(DT); for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) { AllocaInst *AI = Allocas[AllocaNum]; @@ -802,7 +802,8 @@ void PromoteMem2Reg::ComputeLiveInBlocks( // actually live-in here. LiveInBlockWorklist[i] = LiveInBlockWorklist.back(); LiveInBlockWorklist.pop_back(); - --i, --e; + --i; + --e; break; } diff --git a/lib/Transforms/Utils/SanitizerStats.cpp b/lib/Transforms/Utils/SanitizerStats.cpp new file mode 100644 index 0000000000000..9afd175c10ed5 --- /dev/null +++ b/lib/Transforms/Utils/SanitizerStats.cpp @@ -0,0 +1,108 @@ +//===- SanitizerStats.cpp - Sanitizer statistics gathering ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implements code generation for sanitizer statistics gathering. 
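+//
+// In outline, the module's statistics live in an internal global of type
+// roughly { i8*, i32, [N x [2 x i8*]] }: create() appends one entry and emits
+// a call to __sanitizer_stat_report for it, and finish() registers the
+// finished global with __sanitizer_stat_init from a global constructor.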
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/SanitizerStats.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" + +using namespace llvm; + +SanitizerStatReport::SanitizerStatReport(Module *M) : M(M) { + StatTy = ArrayType::get(Type::getInt8PtrTy(M->getContext()), 2); + EmptyModuleStatsTy = makeModuleStatsTy(); + + ModuleStatsGV = new GlobalVariable(*M, EmptyModuleStatsTy, false, + GlobalValue::InternalLinkage, nullptr); +} + +ArrayType *SanitizerStatReport::makeModuleStatsArrayTy() { + return ArrayType::get(StatTy, Inits.size()); +} + +StructType *SanitizerStatReport::makeModuleStatsTy() { + return StructType::get(M->getContext(), {Type::getInt8PtrTy(M->getContext()), + Type::getInt32Ty(M->getContext()), + makeModuleStatsArrayTy()}); +} + +void SanitizerStatReport::create(IRBuilder<> &B, SanitizerStatKind SK) { + Function *F = B.GetInsertBlock()->getParent(); + Module *M = F->getParent(); + PointerType *Int8PtrTy = B.getInt8PtrTy(); + IntegerType *IntPtrTy = B.getIntPtrTy(M->getDataLayout()); + ArrayType *StatTy = ArrayType::get(Int8PtrTy, 2); + + Inits.push_back(ConstantArray::get( + StatTy, + {Constant::getNullValue(Int8PtrTy), + ConstantExpr::getIntToPtr( + ConstantInt::get(IntPtrTy, uint64_t(SK) << (IntPtrTy->getBitWidth() - + kSanitizerStatKindBits)), + Int8PtrTy)})); + + FunctionType *StatReportTy = + FunctionType::get(B.getVoidTy(), Int8PtrTy, false); + Constant *StatReport = M->getOrInsertFunction( + "__sanitizer_stat_report", StatReportTy); + + auto InitAddr = ConstantExpr::getGetElementPtr( + EmptyModuleStatsTy, ModuleStatsGV, + ArrayRef<Constant *>{ + ConstantInt::get(IntPtrTy, 0), ConstantInt::get(B.getInt32Ty(), 2), + ConstantInt::get(IntPtrTy, Inits.size() - 1), + }); + B.CreateCall(StatReport, ConstantExpr::getBitCast(InitAddr, Int8PtrTy)); +} + +void SanitizerStatReport::finish() { + if (Inits.empty()) { + ModuleStatsGV->eraseFromParent(); + return; + } + + PointerType *Int8PtrTy = Type::getInt8PtrTy(M->getContext()); + IntegerType *Int32Ty = Type::getInt32Ty(M->getContext()); + Type *VoidTy = Type::getVoidTy(M->getContext()); + + // Create a new ModuleStatsGV to replace the old one. We can't just set the + // old one's initializer because its type is different. + auto NewModuleStatsGV = new GlobalVariable( + *M, makeModuleStatsTy(), false, GlobalValue::InternalLinkage, + ConstantStruct::getAnon( + {Constant::getNullValue(Int8PtrTy), + ConstantInt::get(Int32Ty, Inits.size()), + ConstantArray::get(makeModuleStatsArrayTy(), Inits)})); + ModuleStatsGV->replaceAllUsesWith( + ConstantExpr::getBitCast(NewModuleStatsGV, ModuleStatsGV->getType())); + ModuleStatsGV->eraseFromParent(); + + // Create a global constructor to register NewModuleStatsGV. 
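+  // The constructor body is just a call to __sanitizer_stat_init on the new
+  // global; appendToGlobalCtors below adds it to llvm.global_ctors.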
+ auto F = Function::Create(FunctionType::get(VoidTy, false), + GlobalValue::InternalLinkage, "", M); + auto BB = BasicBlock::Create(M->getContext(), "", F); + IRBuilder<> B(BB); + + FunctionType *StatInitTy = FunctionType::get(VoidTy, Int8PtrTy, false); + Constant *StatInit = M->getOrInsertFunction( + "__sanitizer_stat_init", StatInitTy); + + B.CreateCall(StatInit, ConstantExpr::getBitCast(NewModuleStatsGV, Int8PtrTy)); + B.CreateRetVoid(); + + appendToGlobalCtors(*M, F, 0); +} diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index e484b690597e9..0504646c304ef 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" @@ -45,6 +44,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include <algorithm> #include <map> @@ -58,17 +58,18 @@ using namespace PatternMatch; // a select, so the "clamp" idiom (of a min followed by a max) will be caught. // To catch this, we need to fold a compare and a select, hence '2' being the // minimum reasonable default. -static cl::opt<unsigned> -PHINodeFoldingThreshold("phi-node-folding-threshold", cl::Hidden, cl::init(2), - cl::desc("Control the amount of phi node folding to perform (default = 2)")); +static cl::opt<unsigned> PHINodeFoldingThreshold( + "phi-node-folding-threshold", cl::Hidden, cl::init(2), + cl::desc( + "Control the amount of phi node folding to perform (default = 2)")); -static cl::opt<bool> -DupRet("simplifycfg-dup-ret", cl::Hidden, cl::init(false), - cl::desc("Duplicate return instructions into unconditional branches")); +static cl::opt<bool> DupRet( + "simplifycfg-dup-ret", cl::Hidden, cl::init(false), + cl::desc("Duplicate return instructions into unconditional branches")); static cl::opt<bool> -SinkCommon("simplifycfg-sink-common", cl::Hidden, cl::init(true), - cl::desc("Sink common instructions down to the end block")); + SinkCommon("simplifycfg-sink-common", cl::Hidden, cl::init(true), + cl::desc("Sink common instructions down to the end block")); static cl::opt<bool> HoistCondStores( "simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true), @@ -96,48 +97,54 @@ static cl::opt<unsigned> MaxSpeculationDepth( "speculatively executed instructions")); STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); -STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping"); -STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables"); -STATISTIC(NumLookupTablesHoles, "Number of switch instructions turned into lookup tables (holes checked)"); +STATISTIC(NumLinearMaps, + "Number of switch instructions turned into linear mapping"); +STATISTIC(NumLookupTables, + "Number of switch instructions turned into lookup tables"); +STATISTIC( + NumLookupTablesHoles, + "Number of switch instructions turned into lookup tables (holes checked)"); STATISTIC(NumTableCmpReuses, "Number of reused switch table lookup compares"); -STATISTIC(NumSinkCommons, "Number of common instructions sunk down to the end block"); +STATISTIC(NumSinkCommons, + "Number of common instructions sunk down to the end block"); 
STATISTIC(NumSpeculations, "Number of speculative executed instructions"); namespace { - // The first field contains the value that the switch produces when a certain - // case group is selected, and the second field is a vector containing the - // cases composing the case group. - typedef SmallVector<std::pair<Constant *, SmallVector<ConstantInt *, 4>>, 2> +// The first field contains the value that the switch produces when a certain +// case group is selected, and the second field is a vector containing the +// cases composing the case group. +typedef SmallVector<std::pair<Constant *, SmallVector<ConstantInt *, 4>>, 2> SwitchCaseResultVectorTy; - // The first field contains the phi node that generates a result of the switch - // and the second field contains the value generated for a certain case in the - // switch for that PHI. - typedef SmallVector<std::pair<PHINode *, Constant *>, 4> SwitchCaseResultsTy; +// The first field contains the phi node that generates a result of the switch +// and the second field contains the value generated for a certain case in the +// switch for that PHI. +typedef SmallVector<std::pair<PHINode *, Constant *>, 4> SwitchCaseResultsTy; - /// ValueEqualityComparisonCase - Represents a case of a switch. - struct ValueEqualityComparisonCase { - ConstantInt *Value; - BasicBlock *Dest; +/// ValueEqualityComparisonCase - Represents a case of a switch. +struct ValueEqualityComparisonCase { + ConstantInt *Value; + BasicBlock *Dest; - ValueEqualityComparisonCase(ConstantInt *Value, BasicBlock *Dest) + ValueEqualityComparisonCase(ConstantInt *Value, BasicBlock *Dest) : Value(Value), Dest(Dest) {} - bool operator<(ValueEqualityComparisonCase RHS) const { - // Comparing pointers is ok as we only rely on the order for uniquing. - return Value < RHS.Value; - } + bool operator<(ValueEqualityComparisonCase RHS) const { + // Comparing pointers is ok as we only rely on the order for uniquing. 
+ return Value < RHS.Value; + } - bool operator==(BasicBlock *RHSDest) const { return Dest == RHSDest; } - }; + bool operator==(BasicBlock *RHSDest) const { return Dest == RHSDest; } +}; class SimplifyCFGOpt { const TargetTransformInfo &TTI; const DataLayout &DL; unsigned BonusInstThreshold; AssumptionCache *AC; + SmallPtrSetImpl<BasicBlock *> *LoopHeaders; Value *isValueEqualityComparison(TerminatorInst *TI); - BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI, - std::vector<ValueEqualityComparisonCase> &Cases); + BasicBlock *GetValueEqualityComparisonCases( + TerminatorInst *TI, std::vector<ValueEqualityComparisonCase> &Cases); bool SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, BasicBlock *Pred, IRBuilder<> &Builder); @@ -152,13 +159,15 @@ class SimplifyCFGOpt { bool SimplifyUnreachable(UnreachableInst *UI); bool SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder); bool SimplifyIndirectBr(IndirectBrInst *IBI); - bool SimplifyUncondBranch(BranchInst *BI, IRBuilder <> &Builder); - bool SimplifyCondBranch(BranchInst *BI, IRBuilder <>&Builder); + bool SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder); + bool SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder); public: SimplifyCFGOpt(const TargetTransformInfo &TTI, const DataLayout &DL, - unsigned BonusInstThreshold, AssumptionCache *AC) - : TTI(TTI), DL(DL), BonusInstThreshold(BonusInstThreshold), AC(AC) {} + unsigned BonusInstThreshold, AssumptionCache *AC, + SmallPtrSetImpl<BasicBlock *> *LoopHeaders) + : TTI(TTI), DL(DL), BonusInstThreshold(BonusInstThreshold), AC(AC), + LoopHeaders(LoopHeaders) {} bool run(BasicBlock *BB); }; } @@ -166,19 +175,19 @@ public: /// Return true if it is safe to merge these two /// terminator instructions together. static bool SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2) { - if (SI1 == SI2) return false; // Can't merge with self! + if (SI1 == SI2) + return false; // Can't merge with self! // It is not safe to merge these two switch instructions if they have a common // successor, and if that successor has a PHI node, and if *that* PHI node has // conflicting incoming values from the two switch blocks. BasicBlock *SI1BB = SI1->getParent(); BasicBlock *SI2BB = SI2->getParent(); - SmallPtrSet<BasicBlock*, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB)); + SmallPtrSet<BasicBlock *, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB)); - for (succ_iterator I = succ_begin(SI2BB), E = succ_end(SI2BB); I != E; ++I) - if (SI1Succs.count(*I)) - for (BasicBlock::iterator BBI = (*I)->begin(); - isa<PHINode>(BBI); ++BBI) { + for (BasicBlock *Succ : successors(SI2BB)) + if (SI1Succs.count(Succ)) + for (BasicBlock::iterator BBI = Succ->begin(); isa<PHINode>(BBI); ++BBI) { PHINode *PN = cast<PHINode>(BBI); if (PN->getIncomingValueForBlock(SI1BB) != PN->getIncomingValueForBlock(SI2BB)) @@ -191,11 +200,12 @@ static bool SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2) { /// Return true if it is safe and profitable to merge these two terminator /// instructions together, where SI1 is an unconditional branch. PhiNodes will /// store all PHI nodes in common successors. -static bool isProfitableToFoldUnconditional(BranchInst *SI1, - BranchInst *SI2, - Instruction *Cond, - SmallVectorImpl<PHINode*> &PhiNodes) { - if (SI1 == SI2) return false; // Can't merge with self! 
+static bool +isProfitableToFoldUnconditional(BranchInst *SI1, BranchInst *SI2, + Instruction *Cond, + SmallVectorImpl<PHINode *> &PhiNodes) { + if (SI1 == SI2) + return false; // Can't merge with self! assert(SI1->isUnconditional() && SI2->isConditional()); // We fold the unconditional branch if we can easily update all PHI nodes in @@ -204,7 +214,8 @@ static bool isProfitableToFoldUnconditional(BranchInst *SI1, // 2> We have "Cond" as the incoming value for the unconditional branch; // 3> SI2->getCondition() and Cond have same operands. CmpInst *Ci2 = dyn_cast<CmpInst>(SI2->getCondition()); - if (!Ci2) return false; + if (!Ci2) + return false; if (!(Cond->getOperand(0) == Ci2->getOperand(0) && Cond->getOperand(1) == Ci2->getOperand(1)) && !(Cond->getOperand(0) == Ci2->getOperand(1) && @@ -213,11 +224,10 @@ static bool isProfitableToFoldUnconditional(BranchInst *SI1, BasicBlock *SI1BB = SI1->getParent(); BasicBlock *SI2BB = SI2->getParent(); - SmallPtrSet<BasicBlock*, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB)); - for (succ_iterator I = succ_begin(SI2BB), E = succ_end(SI2BB); I != E; ++I) - if (SI1Succs.count(*I)) - for (BasicBlock::iterator BBI = (*I)->begin(); - isa<PHINode>(BBI); ++BBI) { + SmallPtrSet<BasicBlock *, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB)); + for (BasicBlock *Succ : successors(SI2BB)) + if (SI1Succs.count(Succ)) + for (BasicBlock::iterator BBI = Succ->begin(); isa<PHINode>(BBI); ++BBI) { PHINode *PN = cast<PHINode>(BBI); if (PN->getIncomingValueForBlock(SI1BB) != Cond || !isa<ConstantInt>(PN->getIncomingValueForBlock(SI2BB))) @@ -233,11 +243,11 @@ static bool isProfitableToFoldUnconditional(BranchInst *SI1, /// of Succ. static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred, BasicBlock *ExistPred) { - if (!isa<PHINode>(Succ->begin())) return; // Quick exit if nothing to do + if (!isa<PHINode>(Succ->begin())) + return; // Quick exit if nothing to do PHINode *PN; - for (BasicBlock::iterator I = Succ->begin(); - (PN = dyn_cast<PHINode>(I)); ++I) + for (BasicBlock::iterator I = Succ->begin(); (PN = dyn_cast<PHINode>(I)); ++I) PN->addIncoming(PN->getIncomingValueForBlock(ExistPred), NewPred); } @@ -270,7 +280,7 @@ static unsigned ComputeSpeculationCost(const User *I, /// V plus its non-dominating operands. If that cost is greater than /// CostRemaining, false is returned and CostRemaining is undefined. static bool DominatesMergePoint(Value *V, BasicBlock *BB, - SmallPtrSetImpl<Instruction*> *AggressiveInsts, + SmallPtrSetImpl<Instruction *> *AggressiveInsts, unsigned &CostRemaining, const TargetTransformInfo &TTI, unsigned Depth = 0) { @@ -294,7 +304,8 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, // We don't want to allow weird loops that might have the "if condition" in // the bottom of this block. - if (PBB == BB) return false; + if (PBB == BB) + return false; // If this instruction is defined in a block that contains an unconditional // branch to BB, then it must be in the 'conditional' part of the "if @@ -305,10 +316,12 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, // If we aren't allowing aggressive promotion anymore, then don't consider // instructions in the 'if region'. - if (!AggressiveInsts) return false; + if (!AggressiveInsts) + return false; // If we have seen this instruction before, don't count it again. - if (AggressiveInsts->count(I)) return true; + if (AggressiveInsts->count(I)) + return true; // Okay, it looks like the instruction IS in the "condition". 
Check to
   // see if it's a cheap instruction to unconditionally compute, and if it
@@ -366,8 +379,8 @@ static ConstantInt *GetConstantInt(Value *V, const DataLayout &DL) {
     if (CI->getType() == PtrTy)
       return CI;
     else
-      return cast<ConstantInt>
-        (ConstantExpr::getIntegerCast(CI, PtrTy, /*isSigned=*/false));
+      return cast<ConstantInt>(
+          ConstantExpr::getIntegerCast(CI, PtrTy, /*isSigned=*/false));
   }
   return nullptr;
 }
@@ -403,11 +416,11 @@ struct ConstantComparesGatherer {
   operator=(const ConstantComparesGatherer &) = delete;
 
 private:
-
   /// Try to set the current value used for the comparison; it succeeds only if
   /// it wasn't set before or if the new value is the same as the old one.
   bool setValueOnce(Value *NewVal) {
-    if(CompValue && CompValue != NewVal) return false;
+    if (CompValue && CompValue != NewVal)
+      return false;
     CompValue = NewVal;
     return (CompValue != nullptr);
   }
@@ -424,35 +437,99 @@ private:
     ICmpInst *ICI;
     ConstantInt *C;
     if (!((ICI = dyn_cast<ICmpInst>(I)) &&
-         (C = GetConstantInt(I->getOperand(1), DL)))) {
+          (C = GetConstantInt(I->getOperand(1), DL)))) {
      return false;
    }
 
    Value *RHSVal;
-    ConstantInt *RHSC;
+    const APInt *RHSC;
 
    // Pattern match a special case
    // (x & ~2^z) == y --> x == y || x == y|2^z
    // This undoes a transformation done by instcombine to fuse 2 compares.
-    if (ICI->getPredicate() == (isEQ ? ICmpInst::ICMP_EQ:ICmpInst::ICMP_NE)) {
+    if (ICI->getPredicate() == (isEQ ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE)) {
+
+      // It's a little bit hard to see why the following transformations are
+      // correct. Here is a CVC3 program to verify them for 64-bit values:
+
+      /*
+      ONE : BITVECTOR(64) = BVZEROEXTEND(0bin1, 63);
+      x : BITVECTOR(64);
+      y : BITVECTOR(64);
+      z : BITVECTOR(64);
+      mask : BITVECTOR(64) = BVSHL(ONE, z);
+      QUERY( (y & ~mask = y) =>
+             ((x & ~mask = y) <=> (x = y OR x = (y | mask)))
+      );
+      QUERY( (y | mask = y) =>
+             ((x | mask = y) <=> (x = y OR x = (y & ~mask)))
+      );
+      */
+
+      // Please note that each pattern must be a dual implication (<--> or
+      // iff). One directional implication can create spurious matches. If the
+      // implication is only one-way, an unsatisfiable condition on the left
+      // side can imply a satisfiable condition on the right side. Dual
+      // implication ensures that satisfiable conditions are transformed to
+      // other satisfiable conditions and unsatisfiable conditions are
+      // transformed to other unsatisfiable conditions.
+
+      // Here is a concrete example of an unsatisfiable condition on the left
+      // implying a satisfiable condition on the right:
+      //
+      // mask = (1 << z)
+      // (x & ~mask) == y --> (x == y || x == (y | mask))
+      //
+      // Substituting y = 3, z = 0 yields:
+      // (x & -2) == 3 --> (x == 3 || x == 2)
+
+      // Pattern match a special case:
+      /*
+      QUERY( (y & ~mask = y) =>
+             ((x & ~mask = y) <=> (x = y OR x = (y | mask)))
+      );
+      */
      if (match(ICI->getOperand(0),
-               m_And(m_Value(RHSVal), m_ConstantInt(RHSC)))) {
-        APInt Not = ~RHSC->getValue();
-        if (Not.isPowerOf2()) {
+                m_And(m_Value(RHSVal), m_APInt(RHSC)))) {
+        APInt Mask = ~*RHSC;
+        if (Mask.isPowerOf2() && (C->getValue() & ~Mask) == C->getValue()) {
          // If we already have a value for the switch, it has to match!
- if(!setValueOnce(RHSVal)) + if (!setValueOnce(RHSVal)) + return false; + + Vals.push_back(C); + Vals.push_back( + ConstantInt::get(C->getContext(), + C->getValue() | Mask)); + UsedICmps++; + return true; + } + } + + // Pattern match a special case: + /* + QUERY( (y | mask = y) => + ((x | mask = y) <=> (x = y OR x = (y & ~mask))) + ); + */ + if (match(ICI->getOperand(0), + m_Or(m_Value(RHSVal), m_APInt(RHSC)))) { + APInt Mask = *RHSC; + if (Mask.isPowerOf2() && (C->getValue() | Mask) == C->getValue()) { + // If we already have a value for the switch, it has to match! + if (!setValueOnce(RHSVal)) return false; Vals.push_back(C); Vals.push_back(ConstantInt::get(C->getContext(), - C->getValue() | Not)); + C->getValue() & ~Mask)); UsedICmps++; return true; } } // If we already have a value for the switch, it has to match! - if(!setValueOnce(ICI->getOperand(0))) + if (!setValueOnce(ICI->getOperand(0))) return false; UsedICmps++; @@ -467,8 +544,8 @@ private: // Shift the range if the compare is fed by an add. This is the range // compare idiom as emitted by instcombine. Value *CandidateVal = I->getOperand(0); - if(match(I->getOperand(0), m_Add(m_Value(RHSVal), m_ConstantInt(RHSC)))) { - Span = Span.subtract(RHSC->getValue()); + if (match(I->getOperand(0), m_Add(m_Value(RHSVal), m_APInt(RHSC)))) { + Span = Span.subtract(*RHSC); CandidateVal = RHSVal; } @@ -484,7 +561,7 @@ private: } // If we already have a value for the switch, it has to match! - if(!setValueOnce(CandidateVal)) + if (!setValueOnce(CandidateVal)) return false; // Add all values from the range to the set @@ -493,7 +570,6 @@ private: UsedICmps++; return true; - } /// Given a potentially 'or'd or 'and'd together collection of icmp @@ -507,18 +583,22 @@ private: // Keep a stack (SmallVector for efficiency) for depth-first traversal SmallVector<Value *, 8> DFT; + SmallPtrSet<Value *, 8> Visited; // Initialize + Visited.insert(V); DFT.push_back(V); - while(!DFT.empty()) { + while (!DFT.empty()) { V = DFT.pop_back_val(); if (Instruction *I = dyn_cast<Instruction>(V)) { // If it is a || (or && depending on isEQ), process the operands. if (I->getOpcode() == (isEQ ? Instruction::Or : Instruction::And)) { - DFT.push_back(I->getOperand(1)); - DFT.push_back(I->getOperand(0)); + if (Visited.insert(I->getOperand(1)).second) + DFT.push_back(I->getOperand(1)); + if (Visited.insert(I->getOperand(0)).second) + DFT.push_back(I->getOperand(0)); continue; } @@ -541,7 +621,6 @@ private: } } }; - } static void EraseTerminatorInstAndDCECond(TerminatorInst *TI) { @@ -556,7 +635,8 @@ static void EraseTerminatorInstAndDCECond(TerminatorInst *TI) { } TI->eraseFromParent(); - if (Cond) RecursivelyDeleteTriviallyDeadInstructions(Cond); + if (Cond) + RecursivelyDeleteTriviallyDeadInstructions(Cond); } /// Return true if the specified terminator checks @@ -566,8 +646,9 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) { if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { // Do not permit merging of large switch instructions into their // predecessors unless there is only one predecessor. 
- if (SI->getNumSuccessors()*std::distance(pred_begin(SI->getParent()), - pred_end(SI->getParent())) <= 128) + if (SI->getNumSuccessors() * std::distance(pred_begin(SI->getParent()), + pred_end(SI->getParent())) <= + 128) CV = SI->getCondition(); } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) if (BI->isConditional() && BI->getCondition()->hasOneUse()) @@ -589,46 +670,44 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) { /// Given a value comparison instruction, /// decode all of the 'cases' that it represents and return the 'default' block. -BasicBlock *SimplifyCFGOpt:: -GetValueEqualityComparisonCases(TerminatorInst *TI, - std::vector<ValueEqualityComparisonCase> - &Cases) { +BasicBlock *SimplifyCFGOpt::GetValueEqualityComparisonCases( + TerminatorInst *TI, std::vector<ValueEqualityComparisonCase> &Cases) { if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { Cases.reserve(SI->getNumCases()); - for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) - Cases.push_back(ValueEqualityComparisonCase(i.getCaseValue(), - i.getCaseSuccessor())); + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; + ++i) + Cases.push_back( + ValueEqualityComparisonCase(i.getCaseValue(), i.getCaseSuccessor())); return SI->getDefaultDest(); } BranchInst *BI = cast<BranchInst>(TI); ICmpInst *ICI = cast<ICmpInst>(BI->getCondition()); BasicBlock *Succ = BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_NE); - Cases.push_back(ValueEqualityComparisonCase(GetConstantInt(ICI->getOperand(1), - DL), - Succ)); + Cases.push_back(ValueEqualityComparisonCase( + GetConstantInt(ICI->getOperand(1), DL), Succ)); return BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_EQ); } - /// Given a vector of bb/value pairs, remove any entries /// in the list that match the specified block. -static void EliminateBlockCases(BasicBlock *BB, - std::vector<ValueEqualityComparisonCase> &Cases) { +static void +EliminateBlockCases(BasicBlock *BB, + std::vector<ValueEqualityComparisonCase> &Cases) { Cases.erase(std::remove(Cases.begin(), Cases.end(), BB), Cases.end()); } /// Return true if there are any keys in C1 that exist in C2 as well. -static bool -ValuesOverlap(std::vector<ValueEqualityComparisonCase> &C1, - std::vector<ValueEqualityComparisonCase > &C2) { +static bool ValuesOverlap(std::vector<ValueEqualityComparisonCase> &C1, + std::vector<ValueEqualityComparisonCase> &C2) { std::vector<ValueEqualityComparisonCase> *V1 = &C1, *V2 = &C2; // Make V1 be smaller than V2. if (V1->size() > V2->size()) std::swap(V1, V2); - if (V1->size() == 0) return false; + if (V1->size() == 0) + return false; if (V1->size() == 1) { // Just scan V2. ConstantInt *TheVal = (*V1)[0].Value; @@ -657,30 +736,30 @@ ValuesOverlap(std::vector<ValueEqualityComparisonCase> &C1, /// also a value comparison with the same value, and if that comparison /// determines the outcome of this comparison. If so, simplify TI. This does a /// very limited form of jump threading. -bool SimplifyCFGOpt:: -SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, - BasicBlock *Pred, - IRBuilder<> &Builder) { +bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor( + TerminatorInst *TI, BasicBlock *Pred, IRBuilder<> &Builder) { Value *PredVal = isValueEqualityComparison(Pred->getTerminator()); - if (!PredVal) return false; // Not a value comparison in predecessor. + if (!PredVal) + return false; // Not a value comparison in predecessor. 
Value *ThisVal = isValueEqualityComparison(TI); assert(ThisVal && "This isn't a value comparison!!"); - if (ThisVal != PredVal) return false; // Different predicates. + if (ThisVal != PredVal) + return false; // Different predicates. // TODO: Preserve branch weight metadata, similarly to how // FoldValueComparisonIntoPredecessors preserves it. // Find out information about when control will move from Pred to TI's block. std::vector<ValueEqualityComparisonCase> PredCases; - BasicBlock *PredDef = GetValueEqualityComparisonCases(Pred->getTerminator(), - PredCases); - EliminateBlockCases(PredDef, PredCases); // Remove default from cases. + BasicBlock *PredDef = + GetValueEqualityComparisonCases(Pred->getTerminator(), PredCases); + EliminateBlockCases(PredDef, PredCases); // Remove default from cases. // Find information about how control leaves this block. std::vector<ValueEqualityComparisonCase> ThisCases; BasicBlock *ThisDef = GetValueEqualityComparisonCases(TI, ThisCases); - EliminateBlockCases(ThisDef, ThisCases); // Remove default from cases. + EliminateBlockCases(ThisDef, ThisCases); // Remove default from cases. // If TI's block is the default block from Pred's comparison, potentially // simplify TI based on this knowledge. @@ -697,13 +776,14 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, assert(ThisCases.size() == 1 && "Branch can only have one case!"); // Insert the new branch. Instruction *NI = Builder.CreateBr(ThisDef); - (void) NI; + (void)NI; // Remove PHI node entries for the dead edge. ThisCases[0].Dest->removePredecessor(TI->getParent()); DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator() - << "Through successor TI: " << *TI << "Leaving: " << *NI << "\n"); + << "Through successor TI: " << *TI << "Leaving: " << *NI + << "\n"); EraseTerminatorInstAndDCECond(TI); return true; @@ -711,7 +791,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, SwitchInst *SI = cast<SwitchInst>(TI); // Okay, TI has cases that are statically dead, prune them away. - SmallPtrSet<Constant*, 16> DeadCases; + SmallPtrSet<Constant *, 16> DeadCases; for (unsigned i = 0, e = PredCases.size(); i != e; ++i) DeadCases.insert(PredCases[i].Value); @@ -732,7 +812,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, --i; if (DeadCases.count(i.getCaseValue())) { if (HasWeight) { - std::swap(Weights[i.getCaseIndex()+1], Weights.back()); + std::swap(Weights[i.getCaseIndex() + 1], Weights.back()); Weights.pop_back(); } i.getCaseSuccessor()->removePredecessor(TI->getParent()); @@ -741,8 +821,8 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, } if (HasWeight && Weights.size() >= 2) SI->setMetadata(LLVMContext::MD_prof, - MDBuilder(SI->getParent()->getContext()). - createBranchWeights(Weights)); + MDBuilder(SI->getParent()->getContext()) + .createBranchWeights(Weights)); DEBUG(dbgs() << "Leaving: " << *TI << "\n"); return true; @@ -755,7 +835,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, for (unsigned i = 0, e = PredCases.size(); i != e; ++i) if (PredCases[i].Dest == TIBB) { if (TIV) - return false; // Cannot handle multiple values coming to this block. + return false; // Cannot handle multiple values coming to this block. TIV = PredCases[i].Value; } assert(TIV && "No edge from pred to succ?"); @@ -770,53 +850,53 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, } // If not handled by any explicit cases, it is handled by the default case. 
- if (!TheRealDest) TheRealDest = ThisDef; + if (!TheRealDest) + TheRealDest = ThisDef; // Remove PHI node entries for dead edges. BasicBlock *CheckEdge = TheRealDest; - for (succ_iterator SI = succ_begin(TIBB), e = succ_end(TIBB); SI != e; ++SI) - if (*SI != CheckEdge) - (*SI)->removePredecessor(TIBB); + for (BasicBlock *Succ : successors(TIBB)) + if (Succ != CheckEdge) + Succ->removePredecessor(TIBB); else CheckEdge = nullptr; // Insert the new branch. Instruction *NI = Builder.CreateBr(TheRealDest); - (void) NI; + (void)NI; DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator() - << "Through successor TI: " << *TI << "Leaving: " << *NI << "\n"); + << "Through successor TI: " << *TI << "Leaving: " << *NI + << "\n"); EraseTerminatorInstAndDCECond(TI); return true; } namespace { - /// This class implements a stable ordering of constant - /// integers that does not depend on their address. This is important for - /// applications that sort ConstantInt's to ensure uniqueness. - struct ConstantIntOrdering { - bool operator()(const ConstantInt *LHS, const ConstantInt *RHS) const { - return LHS->getValue().ult(RHS->getValue()); - } - }; +/// This class implements a stable ordering of constant +/// integers that does not depend on their address. This is important for +/// applications that sort ConstantInt's to ensure uniqueness. +struct ConstantIntOrdering { + bool operator()(const ConstantInt *LHS, const ConstantInt *RHS) const { + return LHS->getValue().ult(RHS->getValue()); + } +}; } static int ConstantIntSortPredicate(ConstantInt *const *P1, ConstantInt *const *P2) { const ConstantInt *LHS = *P1; const ConstantInt *RHS = *P2; - if (LHS->getValue().ult(RHS->getValue())) - return 1; - if (LHS->getValue() == RHS->getValue()) + if (LHS == RHS) return 0; - return -1; + return LHS->getValue().ult(RHS->getValue()) ? 1 : -1; } -static inline bool HasBranchWeights(const Instruction* I) { +static inline bool HasBranchWeights(const Instruction *I) { MDNode *ProfMD = I->getMetadata(LLVMContext::MD_prof); if (ProfMD && ProfMD->getOperand(0)) - if (MDString* MDS = dyn_cast<MDString>(ProfMD->getOperand(0))) + if (MDString *MDS = dyn_cast<MDString>(ProfMD->getOperand(0))) return MDS->getString().equals("branch_weights"); return false; @@ -837,7 +917,7 @@ static void GetBranchWeights(TerminatorInst *TI, // If TI is a conditional eq, the default case is the false case, // and the corresponding branch-weight data is at index 2. We swap the // default weight to be the first entry. - if (BranchInst* BI = dyn_cast<BranchInst>(TI)) { + if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { assert(Weights.size() == 2); ICmpInst *ICI = cast<ICmpInst>(BI->getCondition()); if (ICI->getPredicate() == ICmpInst::ICMP_EQ) @@ -862,17 +942,17 @@ static void FitWeights(MutableArrayRef<uint64_t> Weights) { bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, IRBuilder<> &Builder) { BasicBlock *BB = TI->getParent(); - Value *CV = isValueEqualityComparison(TI); // CondVal + Value *CV = isValueEqualityComparison(TI); // CondVal assert(CV && "Not a comparison?"); bool Changed = false; - SmallVector<BasicBlock*, 16> Preds(pred_begin(BB), pred_end(BB)); + SmallVector<BasicBlock *, 16> Preds(pred_begin(BB), pred_end(BB)); while (!Preds.empty()) { BasicBlock *Pred = Preds.pop_back_val(); // See if the predecessor is a comparison with the same value. 
TerminatorInst *PTI = Pred->getTerminator(); - Value *PCV = isValueEqualityComparison(PTI); // PredCondVal + Value *PCV = isValueEqualityComparison(PTI); // PredCondVal if (PCV == CV && SafeToMergeTerminators(TI, PTI)) { // Figure out which 'cases' to copy from SI to PSI. @@ -885,7 +965,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // Based on whether the default edge from PTI goes to BB or not, fill in // PredCases and PredDefault with the new switch cases we would like to // build. - SmallVector<BasicBlock*, 8> NewSuccessors; + SmallVector<BasicBlock *, 8> NewSuccessors; // Update the branch weight metadata along the way SmallVector<uint64_t, 8> Weights; @@ -915,7 +995,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, if (PredDefault == BB) { // If this is the default destination from PTI, only the edges in TI // that don't occur in PTI, or that branch to BB will be activated. - std::set<ConstantInt*, ConstantIntOrdering> PTIHandled; + std::set<ConstantInt *, ConstantIntOrdering> PTIHandled; for (unsigned i = 0, e = PredCases.size(); i != e; ++i) if (PredCases[i].Dest != BB) PTIHandled.insert(PredCases[i].Value); @@ -925,13 +1005,14 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, if (PredHasWeights || SuccHasWeights) { // Increase weight for the default case. - Weights[0] += Weights[i+1]; - std::swap(Weights[i+1], Weights.back()); + Weights[0] += Weights[i + 1]; + std::swap(Weights[i + 1], Weights.back()); Weights.pop_back(); } PredCases.pop_back(); - --i; --e; + --i; + --e; } // Reconstruct the new switch statement we will be building. @@ -952,8 +1033,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // The default weight is at index 0, so weight for the ith case // should be at index i+1. Scale the cases from successor by // PredDefaultWeight (Weights[0]). - Weights.push_back(Weights[0] * SuccWeights[i+1]); - ValidTotalSuccWeight += SuccWeights[i+1]; + Weights.push_back(Weights[0] * SuccWeights[i + 1]); + ValidTotalSuccWeight += SuccWeights[i + 1]; } } @@ -969,21 +1050,22 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // If this is not the default destination from PSI, only the edges // in SI that occur in PSI with a destination of BB will be // activated. 
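Two pieces of the weight arithmetic above are worth spelling out: a predecessor case that targeted BB is absorbed into the predecessor's default weight, and each case copied over from the successor is scaled by that default weight, since such a case can only fire after the predecessor's default edge was taken. A simplified, self-checking sketch with assumed example weights (the real code also rescales the remaining default; this shows just the two steps quoted above):

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Assumed profile: [PredDefault, PredCase->BB]; successor [Default, Case].
  std::vector<uint64_t> Weights = {8, 2};
  const uint64_t SuccWeights[] = {1, 3};
  // A case that jumped to BB folds into the default weight...
  Weights[0] += Weights[1];
  Weights.pop_back(); // Weights == {10}
  // ...and a case inherited from the successor is scaled by it.
  Weights.push_back(Weights[0] * SuccWeights[1]);
  assert(Weights[0] == 10 && Weights[1] == 30);
  return 0;
}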
- std::set<ConstantInt*, ConstantIntOrdering> PTIHandled; - std::map<ConstantInt*, uint64_t> WeightsForHandled; + std::set<ConstantInt *, ConstantIntOrdering> PTIHandled; + std::map<ConstantInt *, uint64_t> WeightsForHandled; for (unsigned i = 0, e = PredCases.size(); i != e; ++i) if (PredCases[i].Dest == BB) { PTIHandled.insert(PredCases[i].Value); if (PredHasWeights || SuccHasWeights) { - WeightsForHandled[PredCases[i].Value] = Weights[i+1]; - std::swap(Weights[i+1], Weights.back()); + WeightsForHandled[PredCases[i].Value] = Weights[i + 1]; + std::swap(Weights[i + 1], Weights.back()); Weights.pop_back(); } std::swap(PredCases[i], PredCases.back()); PredCases.pop_back(); - --i; --e; + --i; + --e; } // Okay, now we know which constants were sent to BB from the @@ -995,17 +1077,16 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, Weights.push_back(WeightsForHandled[BBCases[i].Value]); PredCases.push_back(BBCases[i]); NewSuccessors.push_back(BBCases[i].Dest); - PTIHandled.erase(BBCases[i].Value);// This constant is taken care of + PTIHandled.erase( + BBCases[i].Value); // This constant is taken care of } // If there are any constants vectored to BB that TI doesn't handle, // they must go to the default destination of TI. - for (std::set<ConstantInt*, ConstantIntOrdering>::iterator I = - PTIHandled.begin(), - E = PTIHandled.end(); I != E; ++I) { + for (ConstantInt *I : PTIHandled) { if (PredHasWeights || SuccHasWeights) - Weights.push_back(WeightsForHandled[*I]); - PredCases.push_back(ValueEqualityComparisonCase(*I, BBDefault)); + Weights.push_back(WeightsForHandled[I]); + PredCases.push_back(ValueEqualityComparisonCase(I, BBDefault)); NewSuccessors.push_back(BBDefault); } } @@ -1024,8 +1105,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, } // Now that the successors are updated, create the new Switch instruction. - SwitchInst *NewSI = Builder.CreateSwitch(CV, PredDefault, - PredCases.size()); + SwitchInst *NewSI = + Builder.CreateSwitch(CV, PredDefault, PredCases.size()); NewSI->setDebugLoc(PTI->getDebugLoc()); for (ValueEqualityComparisonCase &V : PredCases) NewSI->addCase(V.Value, V.Dest); @@ -1036,9 +1117,9 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end()); - NewSI->setMetadata(LLVMContext::MD_prof, - MDBuilder(BB->getContext()). - createBranchWeights(MDWeights)); + NewSI->setMetadata( + LLVMContext::MD_prof, + MDBuilder(BB->getContext()).createBranchWeights(MDWeights)); } EraseTerminatorInstAndDCECond(PTI); @@ -1052,8 +1133,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, if (!InfLoopBlock) { // Insert it at the end of the function, because it's either code, // or it won't matter if it's hot. :) - InfLoopBlock = BasicBlock::Create(BB->getContext(), - "infloop", BB->getParent()); + InfLoopBlock = BasicBlock::Create(BB->getContext(), "infloop", + BB->getParent()); BranchInst::Create(InfLoopBlock, InfLoopBlock); } NewSI->setSuccessor(i, InfLoopBlock); @@ -1070,13 +1151,13 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // can't hoist the invoke, as there is nowhere to put the select in this case. 
static bool isSafeToHoistInvoke(BasicBlock *BB1, BasicBlock *BB2, Instruction *I1, Instruction *I2) { - for (succ_iterator SI = succ_begin(BB1), E = succ_end(BB1); SI != E; ++SI) { + for (BasicBlock *Succ : successors(BB1)) { PHINode *PN; - for (BasicBlock::iterator BBI = SI->begin(); + for (BasicBlock::iterator BBI = Succ->begin(); (PN = dyn_cast<PHINode>(BBI)); ++BBI) { Value *BB1V = PN->getIncomingValueForBlock(BB1); Value *BB2V = PN->getIncomingValueForBlock(BB2); - if (BB1V != BB2V && (BB1V==I1 || BB2V==I2)) { + if (BB1V != BB2V && (BB1V == I1 || BB2V == I2)) { return false; } } @@ -1096,8 +1177,8 @@ static bool HoistThenElseCodeToIf(BranchInst *BI, // O(M*N) situations here where M and N are the sizes of BB1 and BB2. As // such, we currently just scan for obviously identical instructions in an // identical order. - BasicBlock *BB1 = BI->getSuccessor(0); // The true destination. - BasicBlock *BB2 = BI->getSuccessor(1); // The false destination + BasicBlock *BB1 = BI->getSuccessor(0); // The true destination. + BasicBlock *BB2 = BI->getSuccessor(1); // The false destination BasicBlock::iterator BB1_Itr = BB1->begin(); BasicBlock::iterator BB2_Itr = BB2->begin(); @@ -1135,12 +1216,16 @@ static bool HoistThenElseCodeToIf(BranchInst *BI, if (!I2->use_empty()) I2->replaceAllUsesWith(I1); I1->intersectOptionalDataWith(I2); - unsigned KnownIDs[] = { - LLVMContext::MD_tbaa, LLVMContext::MD_range, - LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load, - LLVMContext::MD_nonnull, LLVMContext::MD_invariant_group, - LLVMContext::MD_align, LLVMContext::MD_dereferenceable, - LLVMContext::MD_dereferenceable_or_null}; + unsigned KnownIDs[] = {LLVMContext::MD_tbaa, + LLVMContext::MD_range, + LLVMContext::MD_fpmath, + LLVMContext::MD_invariant_load, + LLVMContext::MD_nonnull, + LLVMContext::MD_invariant_group, + LLVMContext::MD_align, + LLVMContext::MD_dereferenceable, + LLVMContext::MD_dereferenceable_or_null, + LLVMContext::MD_mem_parallel_loop_access}; combineMetadata(I1, I2, KnownIDs); I2->eraseFromParent(); Changed = true; @@ -1165,9 +1250,9 @@ HoistTerminator: if (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2)) return Changed; - for (succ_iterator SI = succ_begin(BB1), E = succ_end(BB1); SI != E; ++SI) { + for (BasicBlock *Succ : successors(BB1)) { PHINode *PN; - for (BasicBlock::iterator BBI = SI->begin(); + for (BasicBlock::iterator BBI = Succ->begin(); (PN = dyn_cast<PHINode>(BBI)); ++BBI) { Value *BB1V = PN->getIncomingValueForBlock(BB1); Value *BB2V = PN->getIncomingValueForBlock(BB2); @@ -1178,7 +1263,7 @@ HoistTerminator: // eliminate undefined control flow then converting it to a select. if (passingValueIsAlwaysUndefined(BB1V, PN) || passingValueIsAlwaysUndefined(BB2V, PN)) - return Changed; + return Changed; if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V)) return Changed; @@ -1196,27 +1281,28 @@ HoistTerminator: NT->takeName(I1); } - IRBuilder<true, NoFolder> Builder(NT); + IRBuilder<NoFolder> Builder(NT); // Hoisting one of the terminators from our successor is a great thing. // Unfortunately, the successors of the if/else blocks may have PHI nodes in // them. If they do, all PHI entries for BB1/BB2 must agree for all PHI // nodes, so we insert select instruction to compute the final result. 
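When the two sides feed different values into a successor PHI, the code that follows manufactures at most one select per distinct (BB1V, BB2V) pair and reuses it across all PHIs; the InsertedSelects map provides the deduplication. The find-or-create shape, as a plain-C++ sketch (the tiny Value struct and the ownership-free allocation are illustration-only assumptions):

#include <map>
#include <string>
#include <utility>

struct Value { std::string Name; };

// Reuse one select per distinct pair of disagreeing incoming values.
static Value *getOrCreateSelect(
    std::map<std::pair<Value *, Value *>, Value *> &InsertedSelects,
    Value *TrueV, Value *FalseV) {
  Value *&Sel = InsertedSelects[std::make_pair(TrueV, FalseV)];
  if (!Sel) // first time we see this pair: create the merged value
    Sel = new Value{TrueV->Name + "." + FalseV->Name};
  return Sel;
}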
- std::map<std::pair<Value*,Value*>, SelectInst*> InsertedSelects; - for (succ_iterator SI = succ_begin(BB1), E = succ_end(BB1); SI != E; ++SI) { + std::map<std::pair<Value *, Value *>, SelectInst *> InsertedSelects; + for (BasicBlock *Succ : successors(BB1)) { PHINode *PN; - for (BasicBlock::iterator BBI = SI->begin(); + for (BasicBlock::iterator BBI = Succ->begin(); (PN = dyn_cast<PHINode>(BBI)); ++BBI) { Value *BB1V = PN->getIncomingValueForBlock(BB1); Value *BB2V = PN->getIncomingValueForBlock(BB2); - if (BB1V == BB2V) continue; + if (BB1V == BB2V) + continue; // These values do not agree. Insert a select instruction before NT // that determines the right value. SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)]; if (!SI) - SI = cast<SelectInst> - (Builder.CreateSelect(BI->getCondition(), BB1V, BB2V, - BB1V->getName()+"."+BB2V->getName())); + SI = cast<SelectInst>( + Builder.CreateSelect(BI->getCondition(), BB1V, BB2V, + BB1V->getName() + "." + BB2V->getName(), BI)); // Make the PHI node use the select for all incoming values for BB1/BB2 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) @@ -1226,8 +1312,8 @@ HoistTerminator: } // Update any PHI nodes in our new successors. - for (succ_iterator SI = succ_begin(BB1), E = succ_end(BB1); SI != E; ++SI) - AddPredecessorToBlock(*SI, BIParent, BB1); + for (BasicBlock *Succ : successors(BB1)) + AddPredecessorToBlock(Succ, BIParent, BB1); EraseTerminatorInstAndDCECond(BI); return true; @@ -1280,10 +1366,12 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { RI2 = BB2->getInstList().rbegin(), RE2 = BB2->getInstList().rend(); // Skip debug info. - while (RI1 != RE1 && isa<DbgInfoIntrinsic>(&*RI1)) ++RI1; + while (RI1 != RE1 && isa<DbgInfoIntrinsic>(&*RI1)) + ++RI1; if (RI1 == RE1) return false; - while (RI2 != RE2 && isa<DbgInfoIntrinsic>(&*RI2)) ++RI2; + while (RI2 != RE2 && isa<DbgInfoIntrinsic>(&*RI2)) + ++RI2; if (RI2 == RE2) return false; // Skip the unconditional branches. @@ -1293,10 +1381,12 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { bool Changed = false; while (RI1 != RE1 && RI2 != RE2) { // Skip debug info. - while (RI1 != RE1 && isa<DbgInfoIntrinsic>(&*RI1)) ++RI1; + while (RI1 != RE1 && isa<DbgInfoIntrinsic>(&*RI1)) + ++RI1; if (RI1 == RE1) return Changed; - while (RI2 != RE2 && isa<DbgInfoIntrinsic>(&*RI2)) ++RI2; + while (RI2 != RE2 && isa<DbgInfoIntrinsic>(&*RI2)) + ++RI2; if (RI2 == RE2) return Changed; @@ -1305,22 +1395,19 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { // I1 and I2 should have a single use in the same PHI node, and they // perform the same operation. // Cannot move control-flow-involving, volatile loads, vaarg, etc. - if (isa<PHINode>(I1) || isa<PHINode>(I2) || - isa<TerminatorInst>(I1) || isa<TerminatorInst>(I2) || - I1->isEHPad() || I2->isEHPad() || + if (isa<PHINode>(I1) || isa<PHINode>(I2) || isa<TerminatorInst>(I1) || + isa<TerminatorInst>(I2) || I1->isEHPad() || I2->isEHPad() || isa<AllocaInst>(I1) || isa<AllocaInst>(I2) || I1->mayHaveSideEffects() || I2->mayHaveSideEffects() || I1->mayReadOrWriteMemory() || I2->mayReadOrWriteMemory() || - !I1->hasOneUse() || !I2->hasOneUse() || - !JointValueMap.count(InstPair) + !I1->hasOneUse() || !I2->hasOneUse() || !JointValueMap.count(InstPair)) return Changed; // Check whether we should swap the operands of ICmpInst. // TODO: Add support for commutativity.
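The operand-swap test above (the subject of the commutativity TODO) reduces to one condition: no operand slot already matches between the two compares, but at least one operand matches the opposite slot, so swapping I2's operands and inverting its predicate may make the instructions identical. Isolated as a plain-C++ predicate over assumed opaque operand handles:

// True when exchanging (B0, B1) could line the second compare up with the
// first: nothing matches slot-for-slot, yet something matches cross-slot.
static bool worthSwappingOperands(const void *A0, const void *A1,
                                  const void *B0, const void *B1) {
  return A0 != B0 && A1 != B1 && (A0 == B1 || A1 == B0);
}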
ICmpInst *ICmp1 = dyn_cast<ICmpInst>(I1), *ICmp2 = dyn_cast<ICmpInst>(I2); bool SwapOpnds = false; - if (ICmp1 && ICmp2 && - ICmp1->getOperand(0) != ICmp2->getOperand(0) && + if (ICmp1 && ICmp2 && ICmp1->getOperand(0) != ICmp2->getOperand(0) && ICmp1->getOperand(1) != ICmp2->getOperand(1) && (ICmp1->getOperand(0) == ICmp2->getOperand(1) || ICmp1->getOperand(1) == ICmp2->getOperand(0))) { @@ -1343,8 +1430,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { continue; // Early exit if we have more-than one pair of different operands or if // we need a PHI node to replace a constant. - if (Op1Idx != ~0U || - isa<Constant>(I1->getOperand(I)) || + if (Op1Idx != ~0U || isa<Constant>(I1->getOperand(I)) || isa<Constant>(I2->getOperand(I))) { // If we can't sink the instructions, undo the swapping. if (SwapOpnds) @@ -1379,7 +1465,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { // We need to update RE1 and RE2 if we are going to sink the first // instruction in the basic block down. - bool UpdateRE1 = (I1 == BB1->begin()), UpdateRE2 = (I2 == BB2->begin()); + bool UpdateRE1 = (I1 == &BB1->front()), UpdateRE2 = (I2 == &BB2->front()); // Sink the instruction. BBEnd->getInstList().splice(FirstNonPhiInBBEnd->getIterator(), BB1->getInstList(), I1); @@ -1444,22 +1530,26 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB, Value *StorePtr = StoreToHoist->getPointerOperand(); // Look for a store to the same pointer in BrBB. - unsigned MaxNumInstToLookAt = 10; - for (BasicBlock::reverse_iterator RI = BrBB->rbegin(), - RE = BrBB->rend(); RI != RE && (--MaxNumInstToLookAt); ++RI) { - Instruction *CurI = &*RI; + unsigned MaxNumInstToLookAt = 9; + for (Instruction &CurI : reverse(*BrBB)) { + if (!MaxNumInstToLookAt) + break; + // Skip debug info. + if (isa<DbgInfoIntrinsic>(CurI)) + continue; + --MaxNumInstToLookAt; // Could be calling an instruction that effects memory like free(). - if (CurI->mayHaveSideEffects() && !isa<StoreInst>(CurI)) + if (CurI.mayHaveSideEffects() && !isa<StoreInst>(CurI)) return nullptr; - StoreInst *SI = dyn_cast<StoreInst>(CurI); - // Found the previous store make sure it stores to the same location. - if (SI && SI->getPointerOperand() == StorePtr) - // Found the previous store, return its value operand. - return SI->getValueOperand(); - else if (SI) + if (auto *SI = dyn_cast<StoreInst>(&CurI)) { + // Found the previous store make sure it stores to the same location. + if (SI->getPointerOperand() == StorePtr) + // Found the previous store, return its value operand. + return SI->getValueOperand(); return nullptr; // Unknown store. + } } return nullptr; @@ -1562,11 +1652,9 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, // Do not hoist the instruction if any of its operands are defined but not // used in BB. The transformation will prevent the operand from // being sunk into the use block. - for (User::op_iterator i = I->op_begin(), e = I->op_end(); - i != e; ++i) { + for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) { Instruction *OpI = dyn_cast<Instruction>(*i); - if (!OpI || OpI->getParent() != BB || - OpI->mayHaveSideEffects()) + if (!OpI || OpI->getParent() != BB || OpI->mayHaveSideEffects()) continue; // Not a candidate for sinking. ++SinkCandidateUseCounts[OpI]; @@ -1576,8 +1664,9 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, // Consider any sink candidates which are only used in CondBB as costs for // speculation. 
Note, while we iterate over a DenseMap here, we are summing // and so iteration order isn't significant. - for (SmallDenseMap<Instruction *, unsigned, 4>::iterator I = - SinkCandidateUseCounts.begin(), E = SinkCandidateUseCounts.end(); + for (SmallDenseMap<Instruction *, unsigned, 4>::iterator + I = SinkCandidateUseCounts.begin(), + E = SinkCandidateUseCounts.end(); I != E; ++I) if (I->first->getNumUses() == I->second) { ++SpeculationCost; @@ -1613,8 +1702,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, return false; unsigned OrigCost = OrigCE ? ComputeSpeculationCost(OrigCE, TTI) : 0; unsigned ThenCost = ThenCE ? ComputeSpeculationCost(ThenCE, TTI) : 0; - unsigned MaxCost = 2 * PHINodeFoldingThreshold * - TargetTransformInfo::TCC_Basic; + unsigned MaxCost = + 2 * PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic; if (OrigCost + ThenCost > MaxCost) return false; @@ -1637,19 +1726,19 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, // Insert a select of the value of the speculated store. if (SpeculatedStoreValue) { - IRBuilder<true, NoFolder> Builder(BI); + IRBuilder<NoFolder> Builder(BI); Value *TrueV = SpeculatedStore->getValueOperand(); Value *FalseV = SpeculatedStoreValue; if (Invert) std::swap(TrueV, FalseV); - Value *S = Builder.CreateSelect(BrCond, TrueV, FalseV, TrueV->getName() + - "." + FalseV->getName()); + Value *S = Builder.CreateSelect( + BrCond, TrueV, FalseV, TrueV->getName() + "." + FalseV->getName(), BI); SpeculatedStore->setOperand(0, S); } // Metadata can be dependent on the condition we are hoisting above. // Conservatively strip all metadata on the instruction. - for (auto &I: *ThenBB) + for (auto &I : *ThenBB) I.dropUnknownNonDebugMetadata(); // Hoist the instructions. @@ -1657,7 +1746,7 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, ThenBB->begin(), std::prev(ThenBB->end())); // Insert selects and rewrite the PHI operands. - IRBuilder<true, NoFolder> Builder(BI); + IRBuilder<NoFolder> Builder(BI); for (BasicBlock::iterator I = EndBB->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I) { unsigned OrigI = PN->getBasicBlockIndex(BB); @@ -1675,8 +1764,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, Value *TrueV = ThenV, *FalseV = OrigV; if (Invert) std::swap(TrueV, FalseV); - Value *V = Builder.CreateSelect(BrCond, TrueV, FalseV, - TrueV->getName() + "." + FalseV->getName()); + Value *V = Builder.CreateSelect( + BrCond, TrueV, FalseV, TrueV->getName() + "." + FalseV->getName(), BI); PN->setIncomingValue(OrigI, V); PN->setIncomingValue(ThenI, V); } @@ -1685,19 +1774,6 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, return true; } -/// \returns True if this block contains a CallInst with the NoDuplicate -/// attribute. -static bool HasNoDuplicateCall(const BasicBlock *BB) { - for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I) { - const CallInst *CI = dyn_cast<CallInst>(I); - if (!CI) - continue; - if (CI->cannotDuplicate()) - return true; - } - return false; -} - /// Return true if we can thread a branch across this block. static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) { BranchInst *BI = cast<BranchInst>(BB->getTerminator()); @@ -1706,14 +1782,16 @@ static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) { for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) { if (isa<DbgInfoIntrinsic>(BBI)) continue; - if (Size > 10) return false; // Don't clone large BB's. 
+ if (Size > 10) + return false; // Don't clone large BB's. ++Size; // We can only support instructions that do not define values that are // live outside of the current basic block. for (User *U : BBI->users()) { Instruction *UI = cast<Instruction>(U); - if (UI->getParent() != BB || isa<PHINode>(UI)) return false; + if (UI->getParent() != BB || isa<PHINode>(UI)) + return false; } // Looks ok, continue checking. @@ -1740,32 +1818,41 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL) { } // Now we know that this block has multiple preds and two succs. - if (!BlockIsSimpleEnoughToThreadThrough(BB)) return false; + if (!BlockIsSimpleEnoughToThreadThrough(BB)) + return false; - if (HasNoDuplicateCall(BB)) return false; + // Can't fold blocks that contain noduplicate or convergent calls. + if (llvm::any_of(*BB, [](const Instruction &I) { + const CallInst *CI = dyn_cast<CallInst>(&I); + return CI && (CI->cannotDuplicate() || CI->isConvergent()); + })) + return false; // Okay, this is a simple enough basic block. See if any phi values are // constants. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { ConstantInt *CB = dyn_cast<ConstantInt>(PN->getIncomingValue(i)); - if (!CB || !CB->getType()->isIntegerTy(1)) continue; + if (!CB || !CB->getType()->isIntegerTy(1)) + continue; // Okay, we now know that all edges from PredBB should be revectored to // branch to RealDest. BasicBlock *PredBB = PN->getIncomingBlock(i); BasicBlock *RealDest = BI->getSuccessor(!CB->getZExtValue()); - if (RealDest == BB) continue; // Skip self loops. + if (RealDest == BB) + continue; // Skip self loops. // Skip if the predecessor's terminator is an indirect branch. - if (isa<IndirectBrInst>(PredBB->getTerminator())) continue; + if (isa<IndirectBrInst>(PredBB->getTerminator())) + continue; // The dest block might have PHI nodes, other predecessors and other // difficult cases. Instead of being smart about this, just insert a new // block that jumps to the destination block, effectively splitting // the edge we are about to create. - BasicBlock *EdgeBB = BasicBlock::Create(BB->getContext(), - RealDest->getName()+".critedge", - RealDest->getParent(), RealDest); + BasicBlock *EdgeBB = + BasicBlock::Create(BB->getContext(), RealDest->getName() + ".critedge", + RealDest->getParent(), RealDest); BranchInst::Create(RealDest, EdgeBB); // Update PHI nodes. @@ -1775,7 +1862,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL) { // instructions into EdgeBB. We know that there will be no uses of the // cloned instructions outside of EdgeBB. BasicBlock::iterator InsertPt = EdgeBB->begin(); - DenseMap<Value*, Value*> TranslateMap; // Track translated values. + DenseMap<Value *, Value *> TranslateMap; // Track translated values. for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) { if (PHINode *PN = dyn_cast<PHINode>(BBI)) { TranslateMap[PN] = PN->getIncomingValueForBlock(PredBB); @@ -1783,26 +1870,31 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL) { } // Clone the instruction. Instruction *N = BBI->clone(); - if (BBI->hasName()) N->setName(BBI->getName()+".c"); + if (BBI->hasName()) + N->setName(BBI->getName() + ".c"); // Update operands due to translation. 
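The new any_of check above folds the old HasNoDuplicateCall helper into its caller and widens it: convergent calls must not be duplicated either, so they now block the fold too. The same shape with std::any_of over an assumed per-instruction record (all names here are illustrative):

#include <algorithm>
#include <vector>

struct InstInfo {
  bool IsCall;
  bool CannotDuplicate;
  bool IsConvergent;
};

// One unclonable call disqualifies the whole block from being threaded.
static bool blockHasUnclonableCall(const std::vector<InstInfo> &BB) {
  return std::any_of(BB.begin(), BB.end(), [](const InstInfo &I) {
    return I.IsCall && (I.CannotDuplicate || I.IsConvergent);
  });
}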
- for (User::op_iterator i = N->op_begin(), e = N->op_end(); - i != e; ++i) { - DenseMap<Value*, Value*>::iterator PI = TranslateMap.find(*i); + for (User::op_iterator i = N->op_begin(), e = N->op_end(); i != e; ++i) { + DenseMap<Value *, Value *>::iterator PI = TranslateMap.find(*i); if (PI != TranslateMap.end()) *i = PI->second; } // Check for trivial simplification. if (Value *V = SimplifyInstruction(N, DL)) { - TranslateMap[&*BBI] = V; - delete N; // Instruction folded away, don't need actual inst + if (!BBI->use_empty()) + TranslateMap[&*BBI] = V; + if (!N->mayHaveSideEffects()) { + delete N; // Instruction folded away, don't need actual inst + N = nullptr; + } } else { - // Insert the new instruction into its new home. - EdgeBB->getInstList().insert(InsertPt, N); if (!BBI->use_empty()) TranslateMap[&*BBI] = N; } + // Insert the new instruction into its new home. + if (N) + EdgeBB->getInstList().insert(InsertPt, N); } // Loop over all of the edges from PredBB to BB, changing them to branch @@ -1852,7 +1944,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, // Loop over the PHI's seeing if we can promote them all to select // instructions. While we are at it, keep track of the instructions // that need to be moved to the dominating block. - SmallPtrSet<Instruction*, 4> AggressiveInsts; + SmallPtrSet<Instruction *, 4> AggressiveInsts; unsigned MaxCostVal0 = PHINodeFoldingThreshold, MaxCostVal1 = PHINodeFoldingThreshold; MaxCostVal0 *= TargetTransformInfo::TCC_Basic; @@ -1876,7 +1968,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, // If we folded the first phi, PN dangles at this point. Refresh it. If // we ran out of PHIs then we simplified them all. PN = dyn_cast<PHINode>(BB->begin()); - if (!PN) return true; + if (!PN) + return true; // Don't fold i1 branches on PHIs which contain binary operators. These can // often be turned into switches and other things. @@ -1886,10 +1979,10 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, isa<BinaryOperator>(IfCond))) return false; - // If we all PHI nodes are promotable, check to make sure that all - // instructions in the predecessor blocks can be promoted as well. If - // not, we won't be able to get rid of the control flow, so it's not - // worth promoting to select instructions. + // If all PHI nodes are promotable, check to make sure that all instructions + // in the predecessor blocks can be promoted as well. If not, we won't be able + // to get rid of the control flow, so it's not worth promoting to select + // instructions. BasicBlock *DomBlock = nullptr; BasicBlock *IfBlock1 = PN->getIncomingBlock(0); BasicBlock *IfBlock2 = PN->getIncomingBlock(1); @@ -1897,11 +1990,12 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, IfBlock1 = nullptr; } else { DomBlock = *pred_begin(IfBlock1); - for (BasicBlock::iterator I = IfBlock1->begin();!isa<TerminatorInst>(I);++I) + for (BasicBlock::iterator I = IfBlock1->begin(); !isa<TerminatorInst>(I); + ++I) if (!AggressiveInsts.count(&*I) && !isa<DbgInfoIntrinsic>(I)) { // This is not an aggressive instruction that we can promote. - // Because of this, we won't be able to get rid of the control - // flow, so the xform is not worth it. + // Because of this, we won't be able to get rid of the control flow, so + // the xform is not worth it. 
return false; } } @@ -1910,11 +2004,12 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, IfBlock2 = nullptr; } else { DomBlock = *pred_begin(IfBlock2); - for (BasicBlock::iterator I = IfBlock2->begin();!isa<TerminatorInst>(I);++I) + for (BasicBlock::iterator I = IfBlock2->begin(); !isa<TerminatorInst>(I); + ++I) if (!AggressiveInsts.count(&*I) && !isa<DbgInfoIntrinsic>(I)) { // This is not an aggressive instruction that we can promote. - // Because of this, we won't be able to get rid of the control - // flow, so the xform is not worth it. + // Because of this, we won't be able to get rid of the control flow, so + // the xform is not worth it. return false; } } @@ -1925,7 +2020,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, // If we can still promote the PHI nodes after this gauntlet of tests, // do all of the PHI's now. Instruction *InsertPt = DomBlock->getTerminator(); - IRBuilder<true, NoFolder> Builder(InsertPt); + IRBuilder<NoFolder> Builder(InsertPt); // Move all 'aggressive' instructions, which are defined in the // conditional parts of the if's up to the dominating block. @@ -1940,13 +2035,12 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) { // Change the PHI node into a select instruction. - Value *TrueVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse); + Value *TrueVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse); Value *FalseVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfTrue); - SelectInst *NV = - cast<SelectInst>(Builder.CreateSelect(IfCond, TrueVal, FalseVal, "")); - PN->replaceAllUsesWith(NV); - NV->takeName(PN); + Value *Sel = Builder.CreateSelect(IfCond, TrueVal, FalseVal, "", InsertPt); + PN->replaceAllUsesWith(Sel); + Sel->takeName(PN); PN->eraseFromParent(); } @@ -2029,51 +2123,32 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI, } else if (isa<UndefValue>(TrueValue)) { TrueValue = FalseValue; } else { - TrueValue = Builder.CreateSelect(BrCond, TrueValue, - FalseValue, "retval"); + TrueValue = + Builder.CreateSelect(BrCond, TrueValue, FalseValue, "retval", BI); } } - Value *RI = !TrueValue ? - Builder.CreateRetVoid() : Builder.CreateRet(TrueValue); + Value *RI = + !TrueValue ? Builder.CreateRetVoid() : Builder.CreateRet(TrueValue); - (void) RI; + (void)RI; DEBUG(dbgs() << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:" << "\n " << *BI << "NewRet = " << *RI - << "TRUEBLOCK: " << *TrueSucc << "FALSEBLOCK: "<< *FalseSucc); + << "TRUEBLOCK: " << *TrueSucc << "FALSEBLOCK: " << *FalseSucc); EraseTerminatorInstAndDCECond(BI); return true; } -/// Given a conditional BranchInstruction, retrieve the probabilities of the -/// branch taking each edge. Fills in the two APInt parameters and returns true, -/// or returns false if no or invalid metadata was found. 
-static bool ExtractBranchMetadata(BranchInst *BI, - uint64_t &ProbTrue, uint64_t &ProbFalse) { - assert(BI->isConditional() && - "Looking for probabilities on unconditional branch?"); - MDNode *ProfileData = BI->getMetadata(LLVMContext::MD_prof); - if (!ProfileData || ProfileData->getNumOperands() != 3) return false; - ConstantInt *CITrue = - mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(1)); - ConstantInt *CIFalse = - mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(2)); - if (!CITrue || !CIFalse) return false; - ProbTrue = CITrue->getValue().getZExtValue(); - ProbFalse = CIFalse->getValue().getZExtValue(); - return true; -} - /// Return true if the given instruction is available /// in its predecessor block. If yes, the instruction will be removed. static bool checkCSEInPredecessor(Instruction *Inst, BasicBlock *PB) { if (!isa<BinaryOperator>(Inst) && !isa<CmpInst>(Inst)) return false; - for (BasicBlock::iterator I = PB->begin(), E = PB->end(); I != E; I++) { - Instruction *PBI = &*I; + for (Instruction &I : *PB) { + Instruction *PBI = &I; // Check whether Inst and PBI generate the same value. if (Inst->isIdenticalTo(PBI)) { Inst->replaceAllUsesWith(PBI); @@ -2084,6 +2159,29 @@ static bool checkCSEInPredecessor(Instruction *Inst, BasicBlock *PB) { return false; } +/// Return true if either PBI or BI has branch weight available, and store +/// the weights in {Pred|Succ}{True|False}Weight. If one of PBI and BI does +/// not have branch weight, use 1:1 as its weight. +static bool extractPredSuccWeights(BranchInst *PBI, BranchInst *BI, + uint64_t &PredTrueWeight, + uint64_t &PredFalseWeight, + uint64_t &SuccTrueWeight, + uint64_t &SuccFalseWeight) { + bool PredHasWeights = + PBI->extractProfMetadata(PredTrueWeight, PredFalseWeight); + bool SuccHasWeights = + BI->extractProfMetadata(SuccTrueWeight, SuccFalseWeight); + if (PredHasWeights || SuccHasWeights) { + if (!PredHasWeights) + PredTrueWeight = PredFalseWeight = 1; + if (!SuccHasWeights) + SuccTrueWeight = SuccFalseWeight = 1; + return true; + } else { + return false; + } +} + /// If this basic block is simple enough, and if a predecessor branches to us /// and one of our successors, fold the block into the predecessor and use /// logical operations to pick the right destination. @@ -2103,8 +2201,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { if (PBI->isConditional() && (BI->getSuccessor(0) == PBI->getSuccessor(0) || BI->getSuccessor(0) == PBI->getSuccessor(1))) { - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); - I != E; ) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { Instruction *Curr = &*I++; if (isa<CmpInst>(Curr)) { Cond = Curr; @@ -2122,13 +2219,14 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { if (!Cond || (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) || Cond->getParent() != BB || !Cond->hasOneUse()) - return false; + return false; // Make sure the instruction after the condition is the cond branch. BasicBlock::iterator CondIt = ++Cond->getIterator(); // Ignore dbg intrinsics. - while (isa<DbgInfoIntrinsic>(CondIt)) ++CondIt; + while (isa<DbgInfoIntrinsic>(CondIt)) + ++CondIt; if (&*CondIt != BI) return false; @@ -2139,7 +2237,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { // as "bonus instructions", and only allow this transformation when the // number of the bonus instructions does not exceed a certain threshold. 
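extractPredSuccWeights above has an easy-to-miss contract: it succeeds when either branch carries weights and fills in the missing side with a neutral 1:1 split, so callers never need a three-way case split. The defaulting logic as a self-contained, self-checking sketch (the function name is an assumption):

#include <cassert>
#include <cstdint>

static bool defaultMissingWeights(bool PredHas, bool SuccHas,
                                  uint64_t &PredT, uint64_t &PredF,
                                  uint64_t &SuccT, uint64_t &SuccF) {
  if (!PredHas && !SuccHas)
    return false; // neither side has profile data: nothing to merge
  if (!PredHas)
    PredT = PredF = 1; // treat the unprofiled side as an even split
  if (!SuccHas)
    SuccT = SuccF = 1;
  return true;
}

int main() {
  uint64_t PT = 0, PF = 0, ST = 6, SF = 2;
  assert(defaultMissingWeights(false, true, PT, PF, ST, SF));
  assert(PT == 1 && PF == 1 && ST == 6 && SF == 2);
  return 0;
}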
unsigned NumBonusInsts = 0; - for (auto I = BB->begin(); Cond != I; ++I) { + for (auto I = BB->begin(); Cond != &*I; ++I) { // Ignore dbg intrinsics. if (isa<DbgInfoIntrinsic>(I)) continue; @@ -2168,7 +2266,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { return false; // Finally, don't infinitely unroll conditional loops. - BasicBlock *TrueDest = BI->getSuccessor(0); + BasicBlock *TrueDest = BI->getSuccessor(0); BasicBlock *FalseDest = (BI->isConditional()) ? BI->getSuccessor(1) : nullptr; if (TrueDest == BB || FalseDest == BB) return false; @@ -2180,10 +2278,9 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { // Check that we have two conditional branches. If there is a PHI node in // the common successor, verify that the same value flows in from both // blocks. - SmallVector<PHINode*, 4> PHIs; + SmallVector<PHINode *, 4> PHIs; if (!PBI || PBI->isUnconditional() || - (BI->isConditional() && - !SafeToMergeTerminators(BI, PBI)) || + (BI->isConditional() && !SafeToMergeTerminators(BI, PBI)) || (!BI->isConditional() && !isProfitableToFoldUnconditional(BI, PBI, Cond, PHIs))) continue; @@ -2193,16 +2290,19 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { bool InvertPredCond = false; if (BI->isConditional()) { - if (PBI->getSuccessor(0) == TrueDest) + if (PBI->getSuccessor(0) == TrueDest) { Opc = Instruction::Or; - else if (PBI->getSuccessor(1) == FalseDest) + } else if (PBI->getSuccessor(1) == FalseDest) { Opc = Instruction::And; - else if (PBI->getSuccessor(0) == FalseDest) - Opc = Instruction::And, InvertPredCond = true; - else if (PBI->getSuccessor(1) == TrueDest) - Opc = Instruction::Or, InvertPredCond = true; - else + } else if (PBI->getSuccessor(0) == FalseDest) { + Opc = Instruction::And; + InvertPredCond = true; + } else if (PBI->getSuccessor(1) == TrueDest) { + Opc = Instruction::Or; + InvertPredCond = true; + } else { continue; + } } else { if (PBI->getSuccessor(0) != TrueDest && PBI->getSuccessor(1) != TrueDest) continue; @@ -2219,8 +2319,8 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { CmpInst *CI = cast<CmpInst>(NewCond); CI->setPredicate(CI->getInversePredicate()); } else { - NewCond = Builder.CreateNot(NewCond, - PBI->getCondition()->getName()+".not"); + NewCond = + Builder.CreateNot(NewCond, PBI->getCondition()->getName() + ".not"); } PBI->setCondition(NewCond); @@ -2234,12 +2334,12 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { // We already make sure Cond is the last instruction before BI. Therefore, // all instructions before Cond other than DbgInfoIntrinsic are bonus // instructions. - for (auto BonusInst = BB->begin(); Cond != BonusInst; ++BonusInst) { + for (auto BonusInst = BB->begin(); Cond != &*BonusInst; ++BonusInst) { if (isa<DbgInfoIntrinsic>(BonusInst)) continue; Instruction *NewBonusInst = BonusInst->clone(); RemapInstruction(NewBonusInst, VMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); VMap[&*BonusInst] = NewBonusInst; // If we moved a load, we cannot any longer claim any knowledge about @@ -2258,49 +2358,49 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { // two conditions together. 
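The four-way opcode selection above is easiest to audit as a table: whichever successor of PBI coincides with a successor of BI fixes both the combining opcode and whether PBI's condition must be inverted first. A sketch that encodes the table directly (the enum and struct are assumptions, and opaque pointers stand in for blocks):

enum class Combine { Or, And, None };

struct FoldPlan {
  Combine Opc;
  bool InvertPredCond;
};

// succ0 == TrueDest : PBI-true already goes where BI-true goes   -> Or
// succ1 == FalseDest: PBI-false already goes where BI-false goes -> And
// succ0 == FalseDest: as And, but PBI's condition must be negated
// succ1 == TrueDest : as Or, but PBI's condition must be negated
static FoldPlan planFold(const void *Succ0, const void *Succ1,
                         const void *TrueDest, const void *FalseDest) {
  if (Succ0 == TrueDest)
    return {Combine::Or, false};
  if (Succ1 == FalseDest)
    return {Combine::And, false};
  if (Succ0 == FalseDest)
    return {Combine::And, true};
  if (Succ1 == TrueDest)
    return {Combine::Or, true};
  return {Combine::None, false};
}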
Instruction *New = Cond->clone(); RemapInstruction(New, VMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); PredBlock->getInstList().insert(PBI->getIterator(), New); New->takeName(Cond); Cond->setName(New->getName() + ".old"); if (BI->isConditional()) { - Instruction *NewCond = - cast<Instruction>(Builder.CreateBinOp(Opc, PBI->getCondition(), - New, "or.cond")); + Instruction *NewCond = cast<Instruction>( + Builder.CreateBinOp(Opc, PBI->getCondition(), New, "or.cond")); PBI->setCondition(NewCond); uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight; - bool PredHasWeights = ExtractBranchMetadata(PBI, PredTrueWeight, - PredFalseWeight); - bool SuccHasWeights = ExtractBranchMetadata(BI, SuccTrueWeight, - SuccFalseWeight); + bool HasWeights = + extractPredSuccWeights(PBI, BI, PredTrueWeight, PredFalseWeight, + SuccTrueWeight, SuccFalseWeight); SmallVector<uint64_t, 8> NewWeights; if (PBI->getSuccessor(0) == BB) { - if (PredHasWeights && SuccHasWeights) { + if (HasWeights) { // PBI: br i1 %x, BB, FalseDest // BI: br i1 %y, TrueDest, FalseDest - //TrueWeight is TrueWeight for PBI * TrueWeight for BI. + // TrueWeight is TrueWeight for PBI * TrueWeight for BI. NewWeights.push_back(PredTrueWeight * SuccTrueWeight); - //FalseWeight is FalseWeight for PBI * TotalWeight for BI + + // FalseWeight is FalseWeight for PBI * TotalWeight for BI + // TrueWeight for PBI * FalseWeight for BI. // We assume that total weights of a BranchInst can fit into 32 bits. // Therefore, we will not have overflow using 64-bit arithmetic. - NewWeights.push_back(PredFalseWeight * (SuccFalseWeight + - SuccTrueWeight) + PredTrueWeight * SuccFalseWeight); + NewWeights.push_back(PredFalseWeight * + (SuccFalseWeight + SuccTrueWeight) + + PredTrueWeight * SuccFalseWeight); } AddPredecessorToBlock(TrueDest, PredBlock, BB); PBI->setSuccessor(0, TrueDest); } if (PBI->getSuccessor(1) == BB) { - if (PredHasWeights && SuccHasWeights) { + if (HasWeights) { // PBI: br i1 %x, TrueDest, BB // BI: br i1 %y, TrueDest, FalseDest - //TrueWeight is TrueWeight for PBI * TotalWeight for BI + + // TrueWeight is TrueWeight for PBI * TotalWeight for BI + // FalseWeight for PBI * TrueWeight for BI. - NewWeights.push_back(PredTrueWeight * (SuccFalseWeight + - SuccTrueWeight) + PredFalseWeight * SuccTrueWeight); - //FalseWeight is FalseWeight for PBI * FalseWeight for BI. + NewWeights.push_back(PredTrueWeight * + (SuccFalseWeight + SuccTrueWeight) + + PredFalseWeight * SuccTrueWeight); + // FalseWeight is FalseWeight for PBI * FalseWeight for BI. NewWeights.push_back(PredFalseWeight * SuccFalseWeight); } AddPredecessorToBlock(FalseDest, PredBlock, BB); @@ -2310,51 +2410,42 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { // Halve the weights if any of them cannot fit in an uint32_t FitWeights(NewWeights); - SmallVector<uint32_t, 8> MDWeights(NewWeights.begin(),NewWeights.end()); - PBI->setMetadata(LLVMContext::MD_prof, - MDBuilder(BI->getContext()). - createBranchWeights(MDWeights)); + SmallVector<uint32_t, 8> MDWeights(NewWeights.begin(), + NewWeights.end()); + PBI->setMetadata( + LLVMContext::MD_prof, + MDBuilder(BI->getContext()).createBranchWeights(MDWeights)); } else PBI->setMetadata(LLVMContext::MD_prof, nullptr); } else { // Update PHI nodes in the common successors. 
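The merged weights above conserve total probability mass: for the succ(0) == BB case, PredTrue*SuccTrue plus PredFalse*(SuccFalse+SuccTrue) + PredTrue*SuccFalse factors back into (PredTrue+PredFalse)*(SuccTrue+SuccFalse). A checkable sketch of that identity, plus one plausible way to implement the "halve the weights" step that FitWeights performs (the example numbers and the fitWeights body are assumptions, not the LLVM implementation):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Halve every weight until the largest fits in 32 bits.
static void fitWeights(std::vector<uint64_t> &W) {
  while (*std::max_element(W.begin(), W.end()) > UINT32_MAX)
    for (uint64_t &X : W)
      X >>= 1;
}

int main() {
  const uint64_t PT = 3, PF = 5, ST = 7, SF = 2; // assumed branch weights
  std::vector<uint64_t> NewW = {PT * ST, PF * (SF + ST) + PT * SF};
  assert(NewW[0] + NewW[1] == (PT + PF) * (ST + SF)); // mass is preserved
  fitWeights(NewW); // a no-op here; matters once the products overflow
  return 0;
}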
for (unsigned i = 0, e = PHIs.size(); i != e; ++i) { ConstantInt *PBI_C = cast<ConstantInt>( - PHIs[i]->getIncomingValueForBlock(PBI->getParent())); + PHIs[i]->getIncomingValueForBlock(PBI->getParent())); assert(PBI_C->getType()->isIntegerTy(1)); Instruction *MergedCond = nullptr; if (PBI->getSuccessor(0) == TrueDest) { // Create (PBI_Cond and PBI_C) or (!PBI_Cond and BI_Value) // PBI_C is true: PBI_Cond or (!PBI_Cond and BI_Value) // is false: !PBI_Cond and BI_Value - Instruction *NotCond = - cast<Instruction>(Builder.CreateNot(PBI->getCondition(), - "not.cond")); - MergedCond = - cast<Instruction>(Builder.CreateBinOp(Instruction::And, - NotCond, New, - "and.cond")); + Instruction *NotCond = cast<Instruction>( + Builder.CreateNot(PBI->getCondition(), "not.cond")); + MergedCond = cast<Instruction>( + Builder.CreateBinOp(Instruction::And, NotCond, New, "and.cond")); if (PBI_C->isOne()) - MergedCond = - cast<Instruction>(Builder.CreateBinOp(Instruction::Or, - PBI->getCondition(), MergedCond, - "or.cond")); + MergedCond = cast<Instruction>(Builder.CreateBinOp( + Instruction::Or, PBI->getCondition(), MergedCond, "or.cond")); } else { // Create (PBI_Cond and BI_Value) or (!PBI_Cond and PBI_C) // PBI_C is true: (PBI_Cond and BI_Value) or (!PBI_Cond) // is false: PBI_Cond and BI_Value - MergedCond = - cast<Instruction>(Builder.CreateBinOp(Instruction::And, - PBI->getCondition(), New, - "and.cond")); + MergedCond = cast<Instruction>(Builder.CreateBinOp( + Instruction::And, PBI->getCondition(), New, "and.cond")); if (PBI_C->isOne()) { - Instruction *NotCond = - cast<Instruction>(Builder.CreateNot(PBI->getCondition(), - "not.cond")); - MergedCond = - cast<Instruction>(Builder.CreateBinOp(Instruction::Or, - NotCond, MergedCond, - "or.cond")); + Instruction *NotCond = cast<Instruction>( + Builder.CreateNot(PBI->getCondition(), "not.cond")); + MergedCond = cast<Instruction>(Builder.CreateBinOp( + Instruction::Or, NotCond, MergedCond, "or.cond")); } } // Update PHI Node. @@ -2371,9 +2462,9 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { // could replace PBI's branch probabilities with BI's. // Copy any debug value intrinsics into the end of PredBlock. - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) - if (isa<DbgInfoIntrinsic>(*I)) - I->clone()->insertBefore(PBI); + for (Instruction &I : *BB) + if (isa<DbgInfoIntrinsic>(I)) + I.clone()->insertBefore(PBI); return true; } @@ -2417,7 +2508,7 @@ static Value *ensureValueAvailableInSuccessor(Value *V, BasicBlock *BB, // where OtherBB is the single other predecessor of BB's only successor. PHINode *PHI = nullptr; BasicBlock *Succ = BB->getSingleSuccessor(); - + for (auto I = Succ->begin(); isa<PHINode>(I); ++I) if (cast<PHINode>(I)->getIncomingValueForBlock(BB) == V) { PHI = cast<PHINode>(I); @@ -2443,8 +2534,8 @@ static Value *ensureValueAvailableInSuccessor(Value *V, BasicBlock *BB, PHI->addIncoming(V, BB); for (BasicBlock *PredBB : predecessors(Succ)) if (PredBB != BB) - PHI->addIncoming(AlternativeV ? AlternativeV : UndefValue::get(V->getType()), - PredBB); + PHI->addIncoming( + AlternativeV ? 
AlternativeV : UndefValue::get(V->getType()), PredBB); return PHI; } @@ -2481,10 +2572,9 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, return N <= PHINodeFoldingThreshold; }; - if (!MergeCondStoresAggressively && (!IsWorthwhile(PTB) || - !IsWorthwhile(PFB) || - !IsWorthwhile(QTB) || - !IsWorthwhile(QFB))) + if (!MergeCondStoresAggressively && + (!IsWorthwhile(PTB) || !IsWorthwhile(PFB) || !IsWorthwhile(QTB) || + !IsWorthwhile(QFB))) return false; // For every pointer, there must be exactly two stores, one coming from @@ -2561,7 +2651,7 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, QStore->eraseFromParent(); PStore->eraseFromParent(); - + return true; } @@ -2593,7 +2683,7 @@ static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI) { // We model triangles as a type of diamond with a nullptr "true" block. // Triangles are canonicalized so that the fallthrough edge is represented by // a true condition, as in the diagram above. - // + // BasicBlock *PTB = PBI->getSuccessor(0); BasicBlock *PFB = PBI->getSuccessor(1); BasicBlock *QTB = QBI->getSuccessor(0); @@ -2622,8 +2712,7 @@ static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI) { // the post-dominating block, and the non-fallthroughs must only have one // predecessor. auto HasOnePredAndOneSucc = [](BasicBlock *BB, BasicBlock *P, BasicBlock *S) { - return BB->getSinglePredecessor() == P && - BB->getSingleSuccessor() == S; + return BB->getSinglePredecessor() == P && BB->getSingleSuccessor() == S; }; if (!PostBB || !HasOnePredAndOneSucc(PFB, PBI->getParent(), QBI->getParent()) || @@ -2637,7 +2726,7 @@ static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI) { // OK, this is a sequence of two diamonds or triangles. // Check if there are stores in PTB or PFB that are repeated in QTB or QFB. - SmallPtrSet<Value *,4> PStoreAddresses, QStoreAddresses; + SmallPtrSet<Value *, 4> PStoreAddresses, QStoreAddresses; for (auto *BB : {PTB, PFB}) { if (!BB) continue; @@ -2652,7 +2741,7 @@ static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI) { if (StoreInst *SI = dyn_cast<StoreInst>(&I)) QStoreAddresses.insert(SI->getPointerOperand()); } - + set_intersect(PStoreAddresses, QStoreAddresses); // set_intersect mutates PStoreAddresses in place. Rename it here to make it // clear what it contains. @@ -2684,9 +2773,9 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, if (BB->getSinglePredecessor()) { // Turn this into a branch on constant. bool CondIsTrue = PBI->getSuccessor(0) == BB; - BI->setCondition(ConstantInt::get(Type::getInt1Ty(BB->getContext()), - CondIsTrue)); - return true; // Nuke the branch on constant. + BI->setCondition( + ConstantInt::get(Type::getInt1Ty(BB->getContext()), CondIsTrue)); + return true; // Nuke the branch on constant. } // Otherwise, if there are multiple predecessors, insert a PHI that merges @@ -2702,13 +2791,13 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, // Any predecessor where the condition is not computable we keep symbolic. 
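mergeConditionalStores only proceeds for addresses written on both sides, which the surrounding code computes by collecting each diamond's store destinations and intersecting the two sets; LLVM's set_intersect mutates its first argument in place. The same filter with std:: containers (names are illustrative):

#include <iterator>
#include <set>

// Keep only the addresses stored by both sides; only those are candidates
// for a single merged conditional store after the two diamonds.
static void intersectInPlace(std::set<const void *> &PAddrs,
                             const std::set<const void *> &QAddrs) {
  for (auto It = PAddrs.begin(); It != PAddrs.end();)
    It = QAddrs.count(*It) ? std::next(It) : PAddrs.erase(It);
}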
for (pred_iterator PI = PB; PI != PE; ++PI) { BasicBlock *P = *PI; - if ((PBI = dyn_cast<BranchInst>(P->getTerminator())) && - PBI != BI && PBI->isConditional() && - PBI->getCondition() == BI->getCondition() && + if ((PBI = dyn_cast<BranchInst>(P->getTerminator())) && PBI != BI && + PBI->isConditional() && PBI->getCondition() == BI->getCondition() && PBI->getSuccessor(0) != PBI->getSuccessor(1)) { bool CondIsTrue = PBI->getSuccessor(0) == BB; - NewPN->addIncoming(ConstantInt::get(Type::getInt1Ty(BB->getContext()), - CondIsTrue), P); + NewPN->addIncoming( + ConstantInt::get(Type::getInt1Ty(BB->getContext()), CondIsTrue), + P); } else { NewPN->addIncoming(BI->getCondition(), P); } @@ -2723,19 +2812,6 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, if (CE->canTrap()) return false; - // If BI is reached from the true path of PBI and PBI's condition implies - // BI's condition, we know the direction of the BI branch. - if (PBI->getSuccessor(0) == BI->getParent() && - isImpliedCondition(PBI->getCondition(), BI->getCondition(), DL) && - PBI->getSuccessor(0) != PBI->getSuccessor(1) && - BB->getSinglePredecessor()) { - // Turn this into a branch on constant. - auto *OldCond = BI->getCondition(); - BI->setCondition(ConstantInt::getTrue(BB->getContext())); - RecursivelyDeleteTriviallyDeadInstructions(OldCond); - return true; // Nuke the branch on constant. - } - // If both branches are conditional and both contain stores to the same // address, remove the stores from the conditionals and create a conditional // merged store at the end. @@ -2753,16 +2829,21 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, return false; int PBIOp, BIOp; - if (PBI->getSuccessor(0) == BI->getSuccessor(0)) - PBIOp = BIOp = 0; - else if (PBI->getSuccessor(0) == BI->getSuccessor(1)) - PBIOp = 0, BIOp = 1; - else if (PBI->getSuccessor(1) == BI->getSuccessor(0)) - PBIOp = 1, BIOp = 0; - else if (PBI->getSuccessor(1) == BI->getSuccessor(1)) - PBIOp = BIOp = 1; - else + if (PBI->getSuccessor(0) == BI->getSuccessor(0)) { + PBIOp = 0; + BIOp = 0; + } else if (PBI->getSuccessor(0) == BI->getSuccessor(1)) { + PBIOp = 0; + BIOp = 1; + } else if (PBI->getSuccessor(1) == BI->getSuccessor(0)) { + PBIOp = 1; + BIOp = 0; + } else if (PBI->getSuccessor(1) == BI->getSuccessor(1)) { + PBIOp = 1; + BIOp = 1; + } else { return false; + } // Check to make sure that the other destination of this branch // isn't BB itself. If so, this is an infinite loop that will @@ -2780,8 +2861,8 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, BasicBlock *CommonDest = PBI->getSuccessor(PBIOp); unsigned NumPhis = 0; - for (BasicBlock::iterator II = CommonDest->begin(); - isa<PHINode>(II); ++II, ++NumPhis) { + for (BasicBlock::iterator II = CommonDest->begin(); isa<PHINode>(II); + ++II, ++NumPhis) { if (NumPhis > 2) // Disable this xform. return false; @@ -2804,7 +2885,6 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, DEBUG(dbgs() << "FOLDING BRs:" << *PBI->getParent() << "AND: " << *BI->getParent()); - // If OtherDest *is* BB, then BB is a basic block with a single conditional // branch in it, where one edge (OtherDest) goes back to itself but the other // exits. We don't *know* that the program avoids the infinite loop @@ -2815,8 +2895,8 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, if (OtherDest == BB) { // Insert it at the end of the function, because it's either code, // or it won't matter if it's hot. 
:) - BasicBlock *InfLoopBlock = BasicBlock::Create(BB->getContext(), - "infloop", BB->getParent()); + BasicBlock *InfLoopBlock = + BasicBlock::Create(BB->getContext(), "infloop", BB->getParent()); BranchInst::Create(InfLoopBlock, InfLoopBlock); OtherDest = InfLoopBlock; } @@ -2828,13 +2908,13 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, // Make sure we get to CommonDest on True&True directions. Value *PBICond = PBI->getCondition(); - IRBuilder<true, NoFolder> Builder(PBI); + IRBuilder<NoFolder> Builder(PBI); if (PBIOp) - PBICond = Builder.CreateNot(PBICond, PBICond->getName()+".not"); + PBICond = Builder.CreateNot(PBICond, PBICond->getName() + ".not"); Value *BICond = BI->getCondition(); if (BIOp) - BICond = Builder.CreateNot(BICond, BICond->getName()+".not"); + BICond = Builder.CreateNot(BICond, BICond->getName() + ".not"); // Merge the conditions. Value *Cond = Builder.CreateOr(PBICond, BICond, "brmerge"); @@ -2846,15 +2926,15 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, // Update branch weight for PBI. uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight; - bool PredHasWeights = ExtractBranchMetadata(PBI, PredTrueWeight, - PredFalseWeight); - bool SuccHasWeights = ExtractBranchMetadata(BI, SuccTrueWeight, - SuccFalseWeight); - if (PredHasWeights && SuccHasWeights) { - uint64_t PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight; - uint64_t PredOther = PBIOp ?PredTrueWeight : PredFalseWeight; - uint64_t SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight; - uint64_t SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight; + uint64_t PredCommon, PredOther, SuccCommon, SuccOther; + bool HasWeights = + extractPredSuccWeights(PBI, BI, PredTrueWeight, PredFalseWeight, + SuccTrueWeight, SuccFalseWeight); + if (HasWeights) { + PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight; + PredOther = PBIOp ? PredTrueWeight : PredFalseWeight; + SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight; + SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight; // The weight to CommonDest should be PredCommon * SuccTotal + // PredOther * SuccCommon. // The weight to OtherDest should be PredOther * SuccOther. @@ -2885,9 +2965,29 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, Value *PBIV = PN->getIncomingValue(PBBIdx); if (BIV != PBIV) { // Insert a select in PBI to pick the right value. - Value *NV = cast<SelectInst> - (Builder.CreateSelect(PBICond, PBIV, BIV, PBIV->getName()+".mux")); + SelectInst *NV = cast<SelectInst>( + Builder.CreateSelect(PBICond, PBIV, BIV, PBIV->getName() + ".mux")); PN->setIncomingValue(PBBIdx, NV); + // Although the select has the same condition as PBI, the original branch + // weights for PBI do not apply to the new select because the select's + // 'logical' edges are incoming edges of the phi that is eliminated, not + // the outgoing edges of PBI. + if (HasWeights) { + uint64_t PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight; + uint64_t PredOther = PBIOp ? PredTrueWeight : PredFalseWeight; + uint64_t SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight; + uint64_t SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight; + // The weight to PredCommonDest should be PredCommon * SuccTotal. + // The weight to PredOtherDest should be PredOther * SuccCommon. 
+ uint64_t NewWeights[2] = {PredCommon * (SuccCommon + SuccOther), + PredOther * SuccCommon}; + + FitWeights(NewWeights); + + NV->setMetadata(LLVMContext::MD_prof, + MDBuilder(BI->getContext()) + .createBranchWeights(NewWeights[0], NewWeights[1])); + } } } @@ -2907,7 +3007,7 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond, BasicBlock *TrueBB, BasicBlock *FalseBB, uint32_t TrueWeight, - uint32_t FalseWeight){ + uint32_t FalseWeight) { // Remove any superfluous successor edges from the CFG. // First, figure out which successors to preserve. // If TrueBB and FalseBB are equal, only try to preserve one copy of that @@ -2942,8 +3042,8 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond, BranchInst *NewBI = Builder.CreateCondBr(Cond, TrueBB, FalseBB); if (TrueWeight != FalseWeight) NewBI->setMetadata(LLVMContext::MD_prof, - MDBuilder(OldTerm->getContext()). - createBranchWeights(TrueWeight, FalseWeight)); + MDBuilder(OldTerm->getContext()) + .createBranchWeights(TrueWeight, FalseWeight)); } } else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) { // Neither of the selected blocks were successors, so this @@ -2988,16 +3088,16 @@ static bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select) { if (HasWeights) { GetBranchWeights(SI, Weights); if (Weights.size() == 1 + SI->getNumCases()) { - TrueWeight = (uint32_t)Weights[SI->findCaseValue(TrueVal). - getSuccessorIndex()]; - FalseWeight = (uint32_t)Weights[SI->findCaseValue(FalseVal). - getSuccessorIndex()]; + TrueWeight = + (uint32_t)Weights[SI->findCaseValue(TrueVal).getSuccessorIndex()]; + FalseWeight = + (uint32_t)Weights[SI->findCaseValue(FalseVal).getSuccessorIndex()]; } } // Perform the actual simplification. - return SimplifyTerminatorOnSelect(SI, Condition, TrueBB, FalseBB, - TrueWeight, FalseWeight); + return SimplifyTerminatorOnSelect(SI, Condition, TrueBB, FalseBB, TrueWeight, + FalseWeight); } // Replaces @@ -3017,8 +3117,8 @@ static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) { BasicBlock *FalseBB = FBA->getBasicBlock(); // Perform the actual simplification. - return SimplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB, - 0, 0); + return SimplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB, 0, + 0); } /// This is called when we find an icmp instruction @@ -3046,7 +3146,8 @@ static bool TryToSimplifyUncondBranchWithICmpInIt( // If the block has any PHIs in it or the icmp has multiple uses, it is too // complex. - if (isa<PHINode>(BB->begin()) || !ICI->hasOneUse()) return false; + if (isa<PHINode>(BB->begin()) || !ICI->hasOneUse()) + return false; Value *V = ICI->getOperand(0); ConstantInt *Cst = cast<ConstantInt>(ICI->getOperand(1)); @@ -3055,7 +3156,8 @@ static bool TryToSimplifyUncondBranchWithICmpInIt( // 'V' and this block is the default case for the switch. In this case we can // fold the compared value into the switch to simplify things. BasicBlock *Pred = BB->getSinglePredecessor(); - if (!Pred || !isa<SwitchInst>(Pred->getTerminator())) return false; + if (!Pred || !isa<SwitchInst>(Pred->getTerminator())) + return false; SwitchInst *SI = cast<SwitchInst>(Pred->getTerminator()); if (SI->getCondition() != V) @@ -3104,7 +3206,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt( // If the icmp is a SETEQ, then the default dest gets false, the new edge gets // true in the PHI. 
Constant *DefaultCst = ConstantInt::getTrue(BB->getContext()); - Constant *NewCst = ConstantInt::getFalse(BB->getContext()); + Constant *NewCst = ConstantInt::getFalse(BB->getContext()); if (ICI->getPredicate() == ICmpInst::ICMP_EQ) std::swap(DefaultCst, NewCst); @@ -3116,21 +3218,21 @@ static bool TryToSimplifyUncondBranchWithICmpInIt( // Okay, the switch goes to this block on a default value. Add an edge from // the switch to the merge point on the compared value. - BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "switch.edge", - BB->getParent(), BB); + BasicBlock *NewBB = + BasicBlock::Create(BB->getContext(), "switch.edge", BB->getParent(), BB); SmallVector<uint64_t, 8> Weights; bool HasWeights = HasBranchWeights(SI); if (HasWeights) { GetBranchWeights(SI, Weights); if (Weights.size() == 1 + SI->getNumCases()) { // Split weight for default case to case for "Cst". - Weights[0] = (Weights[0]+1) >> 1; + Weights[0] = (Weights[0] + 1) >> 1; Weights.push_back(Weights[0]); SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end()); - SI->setMetadata(LLVMContext::MD_prof, - MDBuilder(SI->getContext()). - createBranchWeights(MDWeights)); + SI->setMetadata( + LLVMContext::MD_prof, + MDBuilder(SI->getContext()).createBranchWeights(MDWeights)); } } SI->addCase(Cst, NewBB); @@ -3149,7 +3251,8 @@ static bool TryToSimplifyUncondBranchWithICmpInIt( static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, const DataLayout &DL) { Instruction *Cond = dyn_cast<Instruction>(BI->getCondition()); - if (!Cond) return false; + if (!Cond) + return false; // Change br (X == 0 | X == 1), T, F into a switch instruction. // If this is a bunch of seteq's or'd together, or if it's a bunch of @@ -3158,13 +3261,14 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, // Try to gather values from a chain of and/or to be turned into a switch ConstantComparesGatherer ConstantCompare(Cond, DL); // Unpack the result - SmallVectorImpl<ConstantInt*> &Values = ConstantCompare.Vals; + SmallVectorImpl<ConstantInt *> &Values = ConstantCompare.Vals; Value *CompVal = ConstantCompare.CompValue; unsigned UsedICmps = ConstantCompare.UsedICmps; Value *ExtraCase = ConstantCompare.Extra; // If we didn't have a multiply compared value, fail. - if (!CompVal) return false; + if (!CompVal) + return false; // Avoid turning single icmps into a switch. if (UsedICmps <= 1) @@ -3179,20 +3283,23 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, // If Extra was used, we require at least two switch values to do the // transformation. A switch with one value is just a conditional branch. - if (ExtraCase && Values.size() < 2) return false; + if (ExtraCase && Values.size() < 2) + return false; // TODO: Preserve branch weight metadata, similarly to how // FoldValueComparisonIntoPredecessors preserves it. // Figure out which block is which destination. BasicBlock *DefaultBB = BI->getSuccessor(1); - BasicBlock *EdgeBB = BI->getSuccessor(0); - if (!TrueWhenEqual) std::swap(DefaultBB, EdgeBB); + BasicBlock *EdgeBB = BI->getSuccessor(0); + if (!TrueWhenEqual) + std::swap(DefaultBB, EdgeBB); BasicBlock *BB = BI->getParent(); DEBUG(dbgs() << "Converting 'icmp' chain with " << Values.size() - << " cases into SWITCH. BB is:\n" << *BB); + << " cases into SWITCH. BB is:\n" + << *BB); // If there are any extra values that couldn't be folded into the switch // then we evaluate them with an explicit branch first. 
Split the block @@ -3216,7 +3323,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, AddPredecessorToBlock(EdgeBB, BB, NewBB); DEBUG(dbgs() << " ** 'icmp' chain unhandled condition: " << *ExtraCase - << "\nEXTRABB = " << *BB); + << "\nEXTRABB = " << *BB); BB = NewBB; } @@ -3237,11 +3344,10 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, // We added edges from PI to the EdgeBB. As such, if there were any // PHI nodes in EdgeBB, they need entries to be added corresponding to // the number of edges added. - for (BasicBlock::iterator BBI = EdgeBB->begin(); - isa<PHINode>(BBI); ++BBI) { + for (BasicBlock::iterator BBI = EdgeBB->begin(); isa<PHINode>(BBI); ++BBI) { PHINode *PN = cast<PHINode>(BBI); Value *InVal = PN->getIncomingValueForBlock(BB); - for (unsigned i = 0, e = Values.size()-1; i != e; ++i) + for (unsigned i = 0, e = Values.size() - 1; i != e; ++i) PN->addIncoming(InVal, BB); } @@ -3270,7 +3376,7 @@ bool SimplifyCFGOpt::SimplifyCommonResume(ResumeInst *RI) { // Check that there are no other instructions except for debug intrinsics // between the phi of landing pads (RI->getValue()) and resume instruction. BasicBlock::iterator I = cast<Instruction>(RI->getValue())->getIterator(), - E = RI->getIterator(); + E = RI->getIterator(); while (++I != E) if (!isa<DbgInfoIntrinsic>(I)) return false; @@ -3279,8 +3385,8 @@ bool SimplifyCFGOpt::SimplifyCommonResume(ResumeInst *RI) { auto *PhiLPInst = cast<PHINode>(RI->getValue()); // Check incoming blocks to see if any of them are trivial. - for (unsigned Idx = 0, End = PhiLPInst->getNumIncomingValues(); - Idx != End; Idx++) { + for (unsigned Idx = 0, End = PhiLPInst->getNumIncomingValues(); Idx != End; + Idx++) { auto *IncomingBB = PhiLPInst->getIncomingBlock(Idx); auto *IncomingValue = PhiLPInst->getIncomingValue(Idx); @@ -3289,8 +3395,7 @@ bool SimplifyCFGOpt::SimplifyCommonResume(ResumeInst *RI) { if (IncomingBB->getUniqueSuccessor() != BB) continue; - auto *LandingPad = - dyn_cast<LandingPadInst>(IncomingBB->getFirstNonPHI()); + auto *LandingPad = dyn_cast<LandingPadInst>(IncomingBB->getFirstNonPHI()); // Not the landing pad that caused the control to branch here. if (IncomingValue != LandingPad) continue; @@ -3310,7 +3415,8 @@ bool SimplifyCFGOpt::SimplifyCommonResume(ResumeInst *RI) { } // If no trivial unwind blocks, don't do any simplifications. - if (TrivialUnwindBlocks.empty()) return false; + if (TrivialUnwindBlocks.empty()) + return false; // Turn all invokes that unwind here into calls. for (auto *TrivialBB : TrivialUnwindBlocks) { @@ -3346,8 +3452,8 @@ bool SimplifyCFGOpt::SimplifyCommonResume(ResumeInst *RI) { bool SimplifyCFGOpt::SimplifySingleResume(ResumeInst *RI) { BasicBlock *BB = RI->getParent(); LandingPadInst *LPInst = dyn_cast<LandingPadInst>(BB->getFirstNonPHI()); - assert (RI->getValue() == LPInst && - "Resume must unwind the exception that caused control to here"); + assert(RI->getValue() == LPInst && + "Resume must unwind the exception that caused control to here"); // Check that there are no other instructions except for debug intrinsics. BasicBlock::iterator I = LPInst->getIterator(), E = RI->getIterator(); @@ -3363,10 +3469,12 @@ bool SimplifyCFGOpt::SimplifySingleResume(ResumeInst *RI) { // The landingpad is now unreachable. Zap it. 
BB->eraseFromParent(); + if (LoopHeaders) + LoopHeaders->erase(BB); return true; } -bool SimplifyCFGOpt::SimplifyCleanupReturn(CleanupReturnInst *RI) { +static bool removeEmptyCleanup(CleanupReturnInst *RI) { // If this is a trivial cleanup pad that executes no instructions, it can be // eliminated. If the cleanup pad continues to the caller, any predecessor // that is an EH pad will be updated to continue to the caller and any @@ -3381,12 +3489,29 @@ bool SimplifyCFGOpt::SimplifyCleanupReturn(CleanupReturnInst *RI) { // This isn't an empty cleanup. return false; - // Check that there are no other instructions except for debug intrinsics. + // We cannot kill the pad if it has multiple uses. This typically arises + // from unreachable basic blocks. + if (!CPInst->hasOneUse()) + return false; + + // Check that there are no other instructions except for benign intrinsics. BasicBlock::iterator I = CPInst->getIterator(), E = RI->getIterator(); - while (++I != E) - if (!isa<DbgInfoIntrinsic>(I)) + while (++I != E) { + auto *II = dyn_cast<IntrinsicInst>(I); + if (!II) return false; + Intrinsic::ID IntrinsicID = II->getIntrinsicID(); + switch (IntrinsicID) { + case Intrinsic::dbg_declare: + case Intrinsic::dbg_value: + case Intrinsic::lifetime_end: + break; + default: + return false; + } + } + // If the cleanup return we are simplifying unwinds to the caller, this will // set UnwindDest to nullptr. BasicBlock *UnwindDest = RI->getUnwindDest(); @@ -3430,7 +3555,7 @@ bool SimplifyCFGOpt::SimplifyCleanupReturn(CleanupReturnInst *RI) { // removing, we need to merge that PHI node's incoming values into // DestPN. for (unsigned SrcIdx = 0, SrcE = SrcPN->getNumIncomingValues(); - SrcIdx != SrcE; ++SrcIdx) { + SrcIdx != SrcE; ++SrcIdx) { DestPN->addIncoming(SrcPN->getIncomingValue(SrcIdx), SrcPN->getIncomingBlock(SrcIdx)); } @@ -3484,13 +3609,63 @@ bool SimplifyCFGOpt::SimplifyCleanupReturn(CleanupReturnInst *RI) { return true; } +// Try to merge two cleanuppads together. +static bool mergeCleanupPad(CleanupReturnInst *RI) { + // Skip any cleanuprets which unwind to caller; there is nothing to merge + // with. + BasicBlock *UnwindDest = RI->getUnwindDest(); + if (!UnwindDest) + return false; + + // If this cleanupret isn't the only predecessor of this cleanuppad, it + // wouldn't be safe to merge without code duplication. + if (UnwindDest->getSinglePredecessor() != RI->getParent()) + return false; + + // Verify that our cleanuppad's unwind destination is another cleanuppad. + auto *SuccessorCleanupPad = dyn_cast<CleanupPadInst>(&UnwindDest->front()); + if (!SuccessorCleanupPad) + return false; + + CleanupPadInst *PredecessorCleanupPad = RI->getCleanupPad(); + // Replace any uses of the successor cleanuppad with the predecessor pad. + // The only uses of the successor cleanuppad should be its cleanupret and + // funclet bundle operands. + SuccessorCleanupPad->replaceAllUsesWith(PredecessorCleanupPad); + // Remove the old cleanuppad. + SuccessorCleanupPad->eraseFromParent(); + // Now, we simply replace the cleanupret with a branch to the unwind + // destination. + BranchInst::Create(UnwindDest, RI->getParent()); + RI->eraseFromParent(); + + return true; +} + +bool SimplifyCFGOpt::SimplifyCleanupReturn(CleanupReturnInst *RI) { + // It is possible to transiently have an undef cleanuppad operand because we + // have deleted some, but not all, dead blocks. + // Eventually, this block will be deleted.
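The intrinsic whitelist above is the heart of removeEmptyCleanup: debug intrinsics and lifetime.end markers do not make a cleanup worth keeping. The same test, factored into a helper for clarity (a sketch against the LLVM API of this era; the helper name is invented):

    // Return true if everything strictly between the cleanuppad and its
    // cleanupret is ignorable when deciding whether the pad is empty.
    static bool isBenignCleanupBody(CleanupPadInst *Pad, CleanupReturnInst *Ret) {
      BasicBlock::iterator I = Pad->getIterator(), E = Ret->getIterator();
      while (++I != E) {
        auto *II = dyn_cast<IntrinsicInst>(I);
        if (!II)
          return false; // a real instruction: the cleanup does work
        switch (II->getIntrinsicID()) {
        case Intrinsic::dbg_declare:
        case Intrinsic::dbg_value:
        case Intrinsic::lifetime_end:
          break; // no unwind-time semantics
        default:
          return false;
        }
      }
      return true;
    }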
+ if (isa<UndefValue>(RI->getOperand(0))) + return false; + + if (mergeCleanupPad(RI)) + return true; + + if (removeEmptyCleanup(RI)) + return true; + + return false; +} + bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) { BasicBlock *BB = RI->getParent(); - if (!BB->getFirstNonPHIOrDbg()->isTerminator()) return false; + if (!BB->getFirstNonPHIOrDbg()->isTerminator()) + return false; // Find predecessors that end with branches. - SmallVector<BasicBlock*, 8> UncondBranchPreds; - SmallVector<BranchInst*, 8> CondBranchPreds; + SmallVector<BasicBlock *, 8> UncondBranchPreds; + SmallVector<BranchInst *, 8> CondBranchPreds; for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { BasicBlock *P = *PI; TerminatorInst *PTI = P->getTerminator(); @@ -3507,14 +3682,17 @@ bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) { while (!UncondBranchPreds.empty()) { BasicBlock *Pred = UncondBranchPreds.pop_back_val(); DEBUG(dbgs() << "FOLDING: " << *BB - << "INTO UNCOND BRANCH PRED: " << *Pred); + << "INTO UNCOND BRANCH PRED: " << *Pred); (void)FoldReturnIntoUncondBranch(RI, BB, Pred); } // If we eliminated all predecessors of the block, delete the block now. - if (pred_empty(BB)) + if (pred_empty(BB)) { // We know there are no successors, so just nuke the block. BB->eraseFromParent(); + if (LoopHeaders) + LoopHeaders->erase(BB); + } return true; } @@ -3547,7 +3725,8 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { // Do not delete instructions that can have side effects which might cause // the unreachable to not be reachable; specifically, calls and volatile // operations may have this effect. - if (isa<CallInst>(BBI) && !isa<DbgInfoIntrinsic>(BBI)) break; + if (isa<CallInst>(BBI) && !isa<DbgInfoIntrinsic>(BBI)) + break; if (BBI->mayHaveSideEffects()) { if (auto *SI = dyn_cast<StoreInst>(BBI)) { @@ -3589,9 +3768,10 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { // If the unreachable instruction is the first in the block, take a gander // at all of the predecessors of this instruction, and simplify them. - if (&BB->front() != UI) return Changed; + if (&BB->front() != UI) + return Changed; - SmallVector<BasicBlock*, 8> Preds(pred_begin(BB), pred_end(BB)); + SmallVector<BasicBlock *, 8> Preds(pred_begin(BB), pred_end(BB)); for (unsigned i = 0, e = Preds.size(); i != e; ++i) { TerminatorInst *TI = Preds[i]->getTerminator(); IRBuilder<> Builder(TI); @@ -3613,12 +3793,13 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { } } } else if (auto *SI = dyn_cast<SwitchInst>(TI)) { - for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); - i != e; ++i) + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; + ++i) if (i.getCaseSuccessor() == BB) { BB->removePredecessor(SI->getParent()); SI->removeCase(i); - --i; --e; + --i; + --e; Changed = true; } } else if (auto *II = dyn_cast<InvokeInst>(TI)) { @@ -3667,10 +3848,11 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { } // If this block is now dead, remove it. - if (pred_empty(BB) && - BB != &BB->getParent()->getEntryBlock()) { + if (pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) { // We know there are no successors, so just nuke the block. BB->eraseFromParent(); + if (LoopHeaders) + LoopHeaders->erase(BB); return true; } @@ -3699,25 +3881,28 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { // Partition the cases into two sets with different destinations. 
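The TurnSwitchRangeIntoICmp code that follows collapses a switch whose cases for one destination form a contiguous run into a subtract plus a single unsigned compare. In C terms (a sketch, not the patch's code):

    int before(int x) {
      switch (x) { // cases 3..5 share one destination
      case 3: case 4: case 5:
        return 1;
      default:
        return 0;
      }
    }

    int after(int x) {
      // Offset by the first case, then one unsigned compare covers the run:
      // values below 3 wrap to large unsigned numbers and fail the test.
      return ((unsigned)x - 3u) < 3u ? 1 : 0;
    }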
BasicBlock *DestA = HasDefault ? SI->getDefaultDest() : nullptr; BasicBlock *DestB = nullptr; - SmallVector <ConstantInt *, 16> CasesA; - SmallVector <ConstantInt *, 16> CasesB; + SmallVector<ConstantInt *, 16> CasesA; + SmallVector<ConstantInt *, 16> CasesB; for (SwitchInst::CaseIt I : SI->cases()) { BasicBlock *Dest = I.getCaseSuccessor(); - if (!DestA) DestA = Dest; + if (!DestA) + DestA = Dest; if (Dest == DestA) { CasesA.push_back(I.getCaseValue()); continue; } - if (!DestB) DestB = Dest; + if (!DestB) + DestB = Dest; if (Dest == DestB) { CasesB.push_back(I.getCaseValue()); continue; } - return false; // More than two destinations. + return false; // More than two destinations. } - assert(DestA && DestB && "Single-destination switch should have been folded."); + assert(DestA && DestB && + "Single-destination switch should have been folded."); assert(DestA != DestB); assert(DestB != SI->getDefaultDest()); assert(!CasesB.empty() && "There must be non-default cases."); @@ -3741,7 +3926,8 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { // Start building the compare and branch. Constant *Offset = ConstantExpr::getNeg(ContiguousCases->back()); - Constant *NumCases = ConstantInt::get(Offset->getType(), ContiguousCases->size()); + Constant *NumCases = + ConstantInt::get(Offset->getType(), ContiguousCases->size()); Value *Sub = SI->getCondition(); if (!Offset->isNullValue()) @@ -3773,21 +3959,24 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { FalseWeight /= 2; } NewBI->setMetadata(LLVMContext::MD_prof, - MDBuilder(SI->getContext()).createBranchWeights( - (uint32_t)TrueWeight, (uint32_t)FalseWeight)); + MDBuilder(SI->getContext()) + .createBranchWeights((uint32_t)TrueWeight, + (uint32_t)FalseWeight)); } } // Prune obsolete incoming values off the successors' PHI nodes. for (auto BBI = ContiguousDest->begin(); isa<PHINode>(BBI); ++BBI) { unsigned PreviousEdges = ContiguousCases->size(); - if (ContiguousDest == SI->getDefaultDest()) ++PreviousEdges; + if (ContiguousDest == SI->getDefaultDest()) + ++PreviousEdges; for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I) cast<PHINode>(BBI)->removeIncomingValue(SI->getParent()); } for (auto BBI = OtherDest->begin(); isa<PHINode>(BBI); ++BBI) { unsigned PreviousEdges = SI->getNumCases() - ContiguousCases->size(); - if (OtherDest == SI->getDefaultDest()) ++PreviousEdges; + if (OtherDest == SI->getDefaultDest()) + ++PreviousEdges; for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I) cast<PHINode>(BBI)->removeIncomingValue(SI->getParent()); } @@ -3807,32 +3996,38 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC, APInt KnownZero(Bits, 0), KnownOne(Bits, 0); computeKnownBits(Cond, KnownZero, KnownOne, DL, 0, AC, SI); + // We can also eliminate cases by determining that their values are outside of + // the limited range of the condition based on how many significant (non-sign) + // bits are in the condition value. + unsigned ExtraSignBits = ComputeNumSignBits(Cond, DL, 0, AC, SI) - 1; + unsigned MaxSignificantBitsInCond = Bits - ExtraSignBits; + // Gather dead cases. 
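The new ComputeNumSignBits test above gives EliminateDeadSwitchCases a second way to prove a case dead, alongside the known-bits test. A self-contained arithmetic mirror of both tests, with 64-bit masks standing in for APInt (names are illustrative):

    #include <cstdint>

    // KnownZero/KnownOne: bits proven zero/one in the switch condition.
    // MaxSignificantBits: the condition's width minus its redundant sign bits.
    static bool caseIsDead(int64_t CaseVal, uint64_t KnownZero,
                           uint64_t KnownOne, unsigned MaxSignificantBits) {
      uint64_t V = (uint64_t)CaseVal;
      if (V & KnownZero)
        return true; // the case value sets a bit known to be zero
      if ((V & KnownOne) != KnownOne)
        return true; // the case value clears a bit known to be one
      // Minimum signed bits needed for CaseVal: shrink while the top two
      // bits agree, i.e. while the sign bit is redundant.
      unsigned MinSignedBits = 64;
      while (MinSignedBits > 1 && ((V >> (MinSignedBits - 1)) & 1) ==
                                      ((V >> (MinSignedBits - 2)) & 1))
        --MinSignedBits;
      return MinSignedBits > MaxSignificantBits; // value cannot occur at all
    }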
- SmallVector<ConstantInt*, 8> DeadCases; - for (SwitchInst::CaseIt I = SI->case_begin(), E = SI->case_end(); I != E; ++I) { - if ((I.getCaseValue()->getValue() & KnownZero) != 0 || - (I.getCaseValue()->getValue() & KnownOne) != KnownOne) { - DeadCases.push_back(I.getCaseValue()); - DEBUG(dbgs() << "SimplifyCFG: switch case '" - << I.getCaseValue() << "' is dead.\n"); + SmallVector<ConstantInt *, 8> DeadCases; + for (auto &Case : SI->cases()) { + APInt CaseVal = Case.getCaseValue()->getValue(); + if ((CaseVal & KnownZero) != 0 || (CaseVal & KnownOne) != KnownOne || + (CaseVal.getMinSignedBits() > MaxSignificantBitsInCond)) { + DeadCases.push_back(Case.getCaseValue()); + DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal << " is dead.\n"); } } - // If we can prove that the cases must cover all possible values, the - // default destination becomes dead and we can remove it. If we know some + // If we can prove that the cases must cover all possible values, the + // default destination becomes dead and we can remove it. If we know some // of the bits in the value, we can use that to more precisely compute the // number of possible unique case values. bool HasDefault = - !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg()); - const unsigned NumUnknownBits = Bits - - (KnownZero.Or(KnownOne)).countPopulation(); + !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg()); + const unsigned NumUnknownBits = + Bits - (KnownZero.Or(KnownOne)).countPopulation(); assert(NumUnknownBits <= Bits); if (HasDefault && DeadCases.empty() && - NumUnknownBits < 64 /* avoid overflow */ && + NumUnknownBits < 64 /* avoid overflow */ && SI->getNumCases() == (1ULL << NumUnknownBits)) { DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n"); - BasicBlock *NewDefault = SplitBlockPredecessors(SI->getDefaultDest(), - SI->getParent(), ""); + BasicBlock *NewDefault = + SplitBlockPredecessors(SI->getDefaultDest(), SI->getParent(), ""); SI->setDefaultDest(&*NewDefault); SplitBlock(&*NewDefault, &NewDefault->front()); auto *OldTI = NewDefault->getTerminator(); @@ -3849,12 +4044,12 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC, } // Remove dead cases from the switch. - for (unsigned I = 0, E = DeadCases.size(); I != E; ++I) { - SwitchInst::CaseIt Case = SI->findCaseValue(DeadCases[I]); + for (ConstantInt *DeadCase : DeadCases) { + SwitchInst::CaseIt Case = SI->findCaseValue(DeadCase); assert(Case != SI->case_default() && "Case was not found. Probably mistake in DeadCases forming."); if (HasWeight) { - std::swap(Weights[Case.getCaseIndex()+1], Weights.back()); + std::swap(Weights[Case.getCaseIndex() + 1], Weights.back()); Weights.pop_back(); } @@ -3865,8 +4060,8 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC, if (HasWeight && Weights.size() >= 2) { SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end()); SI->setMetadata(LLVMContext::MD_prof, - MDBuilder(SI->getParent()->getContext()). - createBranchWeights(MDWeights)); + MDBuilder(SI->getParent()->getContext()) + .createBranchWeights(MDWeights)); } return !DeadCases.empty(); @@ -3878,8 +4073,7 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC, /// block and see if the incoming value is equal to CaseValue. If so, return /// the phi node, and set PhiIndex to BB's index in the phi node. 
static PHINode *FindPHIForConditionForwarding(ConstantInt *CaseValue, - BasicBlock *BB, - int *PhiIndex) { + BasicBlock *BB, int *PhiIndex) { if (BB->getFirstNonPHIOrDbg() != BB->getTerminator()) return nullptr; // BB must be empty to be a candidate for simplification. if (!BB->getSinglePredecessor()) @@ -3897,7 +4091,8 @@ static PHINode *FindPHIForConditionForwarding(ConstantInt *CaseValue, assert(Idx >= 0 && "PHI has no entry for predecessor?"); Value *InValue = PHI->getIncomingValue(Idx); - if (InValue != CaseValue) continue; + if (InValue != CaseValue) + continue; *PhiIndex = Idx; return PHI; @@ -3911,17 +4106,19 @@ static PHINode *FindPHIForConditionForwarding(ConstantInt *CaseValue, /// blocks of the switch can be folded away. /// Returns true if a change is made. static bool ForwardSwitchConditionToPHI(SwitchInst *SI) { - typedef DenseMap<PHINode*, SmallVector<int,4> > ForwardingNodesMap; + typedef DenseMap<PHINode *, SmallVector<int, 4>> ForwardingNodesMap; ForwardingNodesMap ForwardingNodes; - for (SwitchInst::CaseIt I = SI->case_begin(), E = SI->case_end(); I != E; ++I) { + for (SwitchInst::CaseIt I = SI->case_begin(), E = SI->case_end(); I != E; + ++I) { ConstantInt *CaseValue = I.getCaseValue(); BasicBlock *CaseDest = I.getCaseSuccessor(); int PhiIndex; - PHINode *PHI = FindPHIForConditionForwarding(CaseValue, CaseDest, - &PhiIndex); - if (!PHI) continue; + PHINode *PHI = + FindPHIForConditionForwarding(CaseValue, CaseDest, &PhiIndex); + if (!PHI) + continue; ForwardingNodes[PHI].push_back(PhiIndex); } @@ -3929,11 +4126,13 @@ static bool ForwardSwitchConditionToPHI(SwitchInst *SI) { bool Changed = false; for (ForwardingNodesMap::iterator I = ForwardingNodes.begin(), - E = ForwardingNodes.end(); I != E; ++I) { + E = ForwardingNodes.end(); + I != E; ++I) { PHINode *Phi = I->first; SmallVectorImpl<int> &Indexes = I->second; - if (Indexes.size() < 2) continue; + if (Indexes.size() < 2) + continue; for (size_t I = 0, E = Indexes.size(); I != E; ++I) Phi->setIncomingValue(Indexes[I], SI->getCondition()); @@ -3954,17 +4153,16 @@ static bool ValidLookupTableConstant(Constant *C) { if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) return CE->isGEPWithNoNotionalOverIndexing(); - return isa<ConstantFP>(C) || - isa<ConstantInt>(C) || - isa<ConstantPointerNull>(C) || - isa<GlobalValue>(C) || - isa<UndefValue>(C); + return isa<ConstantFP>(C) || isa<ConstantInt>(C) || + isa<ConstantPointerNull>(C) || isa<GlobalValue>(C) || + isa<UndefValue>(C); } /// If V is a Constant, return it. Otherwise, try to look up /// its constant value in ConstantPool, returning 0 if it's not there. -static Constant *LookupConstant(Value *V, - const SmallDenseMap<Value*, Constant*>& ConstantPool) { +static Constant * +LookupConstant(Value *V, + const SmallDenseMap<Value *, Constant *> &ConstantPool) { if (Constant *C = dyn_cast<Constant>(V)) return C; return ConstantPool.lookup(V); @@ -4001,7 +4199,7 @@ ConstantFold(Instruction *I, const DataLayout &DL, COps[1], DL); } - return ConstantFoldInstOperands(I->getOpcode(), I->getType(), COps, DL); + return ConstantFoldInstOperands(I, COps, DL); } /// Try to determine the resulting constant values in phi nodes @@ -4018,7 +4216,7 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest, // If CaseDest is empty except for some side-effect free instructions through // which we can constant-propagate the CaseVal, continue to its successor. 
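ForwardSwitchConditionToPHI, shown above, relies on a simple observation: when the constant a case forwards into a phi equals the case value itself, the phi may as well receive the switch condition. At the source level (illustrative sketch):

    int before(int x) {
      int y;
      switch (x) {
      case 1: y = 1; break; // forwarded constant == case value
      case 7: y = 7; break; // likewise
      default: y = 0; break;
      }
      return y;
    }

    int after(int x) {
      // Both incoming values become the condition itself; the now-empty
      // case blocks can then be folded away by later cleanups.
      return (x == 1 || x == 7) ? x : 0;
    }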
- SmallDenseMap<Value*, Constant*> ConstantPool; + SmallDenseMap<Value *, Constant *> ConstantPool; ConstantPool.insert(std::make_pair(SI->getCondition(), CaseVal)); for (BasicBlock::iterator I = CaseDest->begin(), E = CaseDest->end(); I != E; ++I) { @@ -4068,8 +4266,8 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest, if (Idx == -1) continue; - Constant *ConstVal = LookupConstant(PHI->getIncomingValue(Idx), - ConstantPool); + Constant *ConstVal = + LookupConstant(PHI->getIncomingValue(Idx), ConstantPool); if (!ConstVal) return false; @@ -4086,16 +4284,16 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest, // Helper function used to add CaseVal to the list of cases that generate // Result. static void MapCaseToResult(ConstantInt *CaseVal, - SwitchCaseResultVectorTy &UniqueResults, - Constant *Result) { + SwitchCaseResultVectorTy &UniqueResults, + Constant *Result) { for (auto &I : UniqueResults) { if (I.first == Result) { I.second.push_back(CaseVal); return; } } - UniqueResults.push_back(std::make_pair(Result, - SmallVector<ConstantInt*, 4>(1, CaseVal))); + UniqueResults.push_back( + std::make_pair(Result, SmallVector<ConstantInt *, 4>(1, CaseVal))); } // Helper function that initializes a map containing @@ -4137,7 +4335,7 @@ static bool InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI, DefaultResult = DefaultResults.size() == 1 ? DefaultResults.begin()->second : nullptr; if ((!DefaultResult && - !isa<UnreachableInst>(DefaultDest->getFirstNonPHIOrDbg()))) + !isa<UnreachableInst>(DefaultDest->getFirstNonPHIOrDbg()))) return false; return true; @@ -4154,12 +4352,11 @@ static bool InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI, // default: // return 4; // } -static Value * -ConvertTwoCaseSwitch(const SwitchCaseResultVectorTy &ResultVector, - Constant *DefaultResult, Value *Condition, - IRBuilder<> &Builder) { +static Value *ConvertTwoCaseSwitch(const SwitchCaseResultVectorTy &ResultVector, + Constant *DefaultResult, Value *Condition, + IRBuilder<> &Builder) { assert(ResultVector.size() == 2 && - "We should have exactly two unique results at this point"); + "We should have exactly two unique results at this point"); // If we are selecting between only two cases transform into a simple // select or a two-way select if default is possible. if (ResultVector[0].second.size() == 1 && @@ -4177,8 +4374,8 @@ ConvertTwoCaseSwitch(const SwitchCaseResultVectorTy &ResultVector, } Value *const ValueCompare = Builder.CreateICmpEQ(Condition, FirstCase, "switch.selectcmp"); - return Builder.CreateSelect(ValueCompare, ResultVector[0].first, SelectValue, - "switch.select"); + return Builder.CreateSelect(ValueCompare, ResultVector[0].first, + SelectValue, "switch.select"); } return nullptr; @@ -4227,9 +4424,8 @@ static bool SwitchToSelect(SwitchInst *SI, IRBuilder<> &Builder, assert(PHI != nullptr && "PHI for value select not found"); Builder.SetInsertPoint(SI); - Value *SelectValue = ConvertTwoCaseSwitch( - UniqueResults, - DefaultResult, Cond, Builder); + Value *SelectValue = + ConvertTwoCaseSwitch(UniqueResults, DefaultResult, Cond, Builder); if (SelectValue) { RemoveSwitchAfterSelectConversion(SI, PHI, SelectValue, Builder); return true; @@ -4239,62 +4435,62 @@ static bool SwitchToSelect(SwitchInst *SI, IRBuilder<> &Builder, } namespace { - /// This class represents a lookup table that can be used to replace a switch. 
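ConvertTwoCaseSwitch, defined above, handles the degenerate switch with exactly two distinct results. A sketch of the effect, assuming one case per result and a reachable default:

    int before(int x, int a, int b, int c) {
      switch (x) {
      case 10: return a;
      case 20: return b;
      default: return c;
      }
    }

    int after(int x, int a, int b, int c) {
      int tmp = (x == 20) ? b : c; // default arm folded into a select
      return (x == 10) ? a : tmp;  // the "switch.select" result
    }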
- class SwitchLookupTable { - public: - /// Create a lookup table to use as a switch replacement with the contents - /// of Values, using DefaultValue to fill any holes in the table. - SwitchLookupTable( - Module &M, uint64_t TableSize, ConstantInt *Offset, - const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values, - Constant *DefaultValue, const DataLayout &DL); - - /// Build instructions with Builder to retrieve the value at - /// the position given by Index in the lookup table. - Value *BuildLookup(Value *Index, IRBuilder<> &Builder); - - /// Return true if a table with TableSize elements of - /// type ElementType would fit in a target-legal register. - static bool WouldFitInRegister(const DataLayout &DL, uint64_t TableSize, - Type *ElementType); - - private: - // Depending on the contents of the table, it can be represented in - // different ways. - enum { - // For tables where each element contains the same value, we just have to - // store that single value and return it for each lookup. - SingleValueKind, - - // For tables where there is a linear relationship between table index - // and values. We calculate the result with a simple multiplication - // and addition instead of a table lookup. - LinearMapKind, - - // For small tables with integer elements, we can pack them into a bitmap - // that fits into a target-legal register. Values are retrieved by - // shift and mask operations. - BitMapKind, - - // The table is stored as an array of values. Values are retrieved by load - // instructions from the table. - ArrayKind - } Kind; - - // For SingleValueKind, this is the single value. - Constant *SingleValue; - - // For BitMapKind, this is the bitmap. - ConstantInt *BitMap; - IntegerType *BitMapElementTy; - - // For LinearMapKind, these are the constants used to derive the value. - ConstantInt *LinearOffset; - ConstantInt *LinearMultiplier; - - // For ArrayKind, this is the array. - GlobalVariable *Array; - }; +/// This class represents a lookup table that can be used to replace a switch. +class SwitchLookupTable { +public: + /// Create a lookup table to use as a switch replacement with the contents + /// of Values, using DefaultValue to fill any holes in the table. + SwitchLookupTable( + Module &M, uint64_t TableSize, ConstantInt *Offset, + const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values, + Constant *DefaultValue, const DataLayout &DL); + + /// Build instructions with Builder to retrieve the value at + /// the position given by Index in the lookup table. + Value *BuildLookup(Value *Index, IRBuilder<> &Builder); + + /// Return true if a table with TableSize elements of + /// type ElementType would fit in a target-legal register. + static bool WouldFitInRegister(const DataLayout &DL, uint64_t TableSize, + Type *ElementType); + +private: + // Depending on the contents of the table, it can be represented in + // different ways. + enum { + // For tables where each element contains the same value, we just have to + // store that single value and return it for each lookup. + SingleValueKind, + + // For tables where there is a linear relationship between table index + // and values. We calculate the result with a simple multiplication + // and addition instead of a table lookup. + LinearMapKind, + + // For small tables with integer elements, we can pack them into a bitmap + // that fits into a target-legal register. Values are retrieved by + // shift and mask operations. + BitMapKind, + + // The table is stored as an array of values. 
Values are retrieved by load + // instructions from the table. + ArrayKind + } Kind; + + // For SingleValueKind, this is the single value. + Constant *SingleValue; + + // For BitMapKind, this is the bitmap. + ConstantInt *BitMap; + IntegerType *BitMapElementTy; + + // For LinearMapKind, these are the constants used to derive the value. + ConstantInt *LinearOffset; + ConstantInt *LinearMultiplier; + + // For ArrayKind, this is the array. + GlobalVariable *Array; +}; } SwitchLookupTable::SwitchLookupTable( @@ -4312,14 +4508,13 @@ SwitchLookupTable::SwitchLookupTable( Type *ValueType = Values.begin()->second->getType(); // Build up the table contents. - SmallVector<Constant*, 64> TableContents(TableSize); + SmallVector<Constant *, 64> TableContents(TableSize); for (size_t I = 0, E = Values.size(); I != E; ++I) { ConstantInt *CaseVal = Values[I].first; Constant *CaseRes = Values[I].second; assert(CaseRes->getType() == ValueType); - uint64_t Idx = (CaseVal->getValue() - Offset->getValue()) - .getLimitedValue(); + uint64_t Idx = (CaseVal->getValue() - Offset->getValue()).getLimitedValue(); TableContents[Idx] = CaseRes; if (CaseRes != SingleValue) @@ -4407,65 +4602,62 @@ SwitchLookupTable::SwitchLookupTable( ArrayType *ArrayTy = ArrayType::get(ValueType, TableSize); Constant *Initializer = ConstantArray::get(ArrayTy, TableContents); - Array = new GlobalVariable(M, ArrayTy, /*constant=*/ true, - GlobalVariable::PrivateLinkage, - Initializer, + Array = new GlobalVariable(M, ArrayTy, /*constant=*/true, + GlobalVariable::PrivateLinkage, Initializer, "switch.table"); - Array->setUnnamedAddr(true); + Array->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); Kind = ArrayKind; } Value *SwitchLookupTable::BuildLookup(Value *Index, IRBuilder<> &Builder) { switch (Kind) { - case SingleValueKind: - return SingleValue; - case LinearMapKind: { - // Derive the result value from the input value. - Value *Result = Builder.CreateIntCast(Index, LinearMultiplier->getType(), - false, "switch.idx.cast"); - if (!LinearMultiplier->isOne()) - Result = Builder.CreateMul(Result, LinearMultiplier, "switch.idx.mult"); - if (!LinearOffset->isZero()) - Result = Builder.CreateAdd(Result, LinearOffset, "switch.offset"); - return Result; - } - case BitMapKind: { - // Type of the bitmap (e.g. i59). - IntegerType *MapTy = BitMap->getType(); - - // Cast Index to the same type as the bitmap. - // Note: The Index is <= the number of elements in the table, so - // truncating it to the width of the bitmask is safe. - Value *ShiftAmt = Builder.CreateZExtOrTrunc(Index, MapTy, "switch.cast"); - - // Multiply the shift amount by the element width. - ShiftAmt = Builder.CreateMul(ShiftAmt, - ConstantInt::get(MapTy, BitMapElementTy->getBitWidth()), - "switch.shiftamt"); - - // Shift down. - Value *DownShifted = Builder.CreateLShr(BitMap, ShiftAmt, - "switch.downshift"); - // Mask off. - return Builder.CreateTrunc(DownShifted, BitMapElementTy, - "switch.masked"); - } - case ArrayKind: { - // Make sure the table index will not overflow when treated as signed. 
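Of the four table kinds declared above, LinearMapKind and BitMapKind never touch memory. A standalone model of the arithmetic each one emits (element width and signatures are illustrative; assumes ElemBits < 64):

    #include <cstdint>

    // LinearMapKind: results are Offset + Multiplier * index, so no table.
    static int64_t linearLookup(int64_t Offset, int64_t Mult, uint64_t Index) {
      return Offset + Mult * (int64_t)Index;
    }

    // BitMapKind: the whole table packed into one register; element i is
    // fetched by shift and mask ("switch.shiftamt" / "switch.masked").
    static uint64_t bitmapLookup(uint64_t BitMap, unsigned ElemBits,
                                 uint64_t Index) {
      uint64_t Shifted = BitMap >> (Index * ElemBits);
      return Shifted & ((1ULL << ElemBits) - 1);
    }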
- IntegerType *IT = cast<IntegerType>(Index->getType()); - uint64_t TableSize = Array->getInitializer()->getType() - ->getArrayNumElements(); - if (TableSize > (1ULL << (IT->getBitWidth() - 1))) - Index = Builder.CreateZExt(Index, - IntegerType::get(IT->getContext(), - IT->getBitWidth() + 1), - "switch.tableidx.zext"); - - Value *GEPIndices[] = { Builder.getInt32(0), Index }; - Value *GEP = Builder.CreateInBoundsGEP(Array->getValueType(), Array, - GEPIndices, "switch.gep"); - return Builder.CreateLoad(GEP, "switch.load"); - } + case SingleValueKind: + return SingleValue; + case LinearMapKind: { + // Derive the result value from the input value. + Value *Result = Builder.CreateIntCast(Index, LinearMultiplier->getType(), + false, "switch.idx.cast"); + if (!LinearMultiplier->isOne()) + Result = Builder.CreateMul(Result, LinearMultiplier, "switch.idx.mult"); + if (!LinearOffset->isZero()) + Result = Builder.CreateAdd(Result, LinearOffset, "switch.offset"); + return Result; + } + case BitMapKind: { + // Type of the bitmap (e.g. i59). + IntegerType *MapTy = BitMap->getType(); + + // Cast Index to the same type as the bitmap. + // Note: The Index is <= the number of elements in the table, so + // truncating it to the width of the bitmask is safe. + Value *ShiftAmt = Builder.CreateZExtOrTrunc(Index, MapTy, "switch.cast"); + + // Multiply the shift amount by the element width. + ShiftAmt = Builder.CreateMul( + ShiftAmt, ConstantInt::get(MapTy, BitMapElementTy->getBitWidth()), + "switch.shiftamt"); + + // Shift down. + Value *DownShifted = + Builder.CreateLShr(BitMap, ShiftAmt, "switch.downshift"); + // Mask off. + return Builder.CreateTrunc(DownShifted, BitMapElementTy, "switch.masked"); + } + case ArrayKind: { + // Make sure the table index will not overflow when treated as signed. + IntegerType *IT = cast<IntegerType>(Index->getType()); + uint64_t TableSize = + Array->getInitializer()->getType()->getArrayNumElements(); + if (TableSize > (1ULL << (IT->getBitWidth() - 1))) + Index = Builder.CreateZExt( + Index, IntegerType::get(IT->getContext(), IT->getBitWidth() + 1), + "switch.tableidx.zext"); + + Value *GEPIndices[] = {Builder.getInt32(0), Index}; + Value *GEP = Builder.CreateInBoundsGEP(Array->getValueType(), Array, + GEPIndices, "switch.gep"); + return Builder.CreateLoad(GEP, "switch.load"); + } } llvm_unreachable("Unknown lookup table kind!"); } @@ -4480,7 +4672,7 @@ bool SwitchLookupTable::WouldFitInRegister(const DataLayout &DL, // are <= 15, we could try to narrow the type. // Avoid overflow, fitsInLegalInteger uses unsigned int for the width. - if (TableSize >= UINT_MAX/IT->getBitWidth()) + if (TableSize >= UINT_MAX / IT->getBitWidth()) return false; return DL.fitsInLegalInteger(TableSize * IT->getBitWidth()); } @@ -4503,8 +4695,9 @@ ShouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize, HasIllegalType = HasIllegalType || !TTI.isTypeLegal(Ty); // Saturate this flag to false. - AllTablesFitInRegister = AllTablesFitInRegister && - SwitchLookupTable::WouldFitInRegister(DL, TableSize, Ty); + AllTablesFitInRegister = + AllTablesFitInRegister && + SwitchLookupTable::WouldFitInRegister(DL, TableSize, Ty); // If both flags saturate, we're done. NOTE: This *only* works with // saturating flags, and all flags have to saturate first due to the @@ -4547,9 +4740,10 @@ ShouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize, /// ... /// \endcode /// Jump threading will then eliminate the second if(cond). 
-static void reuseTableCompare(User *PhiUser, BasicBlock *PhiBlock, - BranchInst *RangeCheckBranch, Constant *DefaultValue, - const SmallVectorImpl<std::pair<ConstantInt*, Constant*> >& Values) { +static void reuseTableCompare( + User *PhiUser, BasicBlock *PhiBlock, BranchInst *RangeCheckBranch, + Constant *DefaultValue, + const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values) { ICmpInst *CmpInst = dyn_cast<ICmpInst>(PhiUser); if (!CmpInst) @@ -4578,13 +4772,13 @@ static void reuseTableCompare(User *PhiUser, BasicBlock *PhiBlock, // compare result. for (auto ValuePair : Values) { Constant *CaseConst = ConstantExpr::getICmp(CmpInst->getPredicate(), - ValuePair.second, CmpOp1, true); + ValuePair.second, CmpOp1, true); if (!CaseConst || CaseConst == DefaultConst) return; assert((CaseConst == TrueConst || CaseConst == FalseConst) && "Expect true or false as compare result."); } - + // Check if the branch instruction dominates the phi node. It's a simple // dominance check, but sufficient for our needs. // Although this check is invariant in the calling loops, it's better to do it @@ -4602,9 +4796,9 @@ static void reuseTableCompare(User *PhiUser, BasicBlock *PhiBlock, ++NumTableCmpReuses; } else { // The compare yields the same result, just inverted. We can replace it. - Value *InvertedTableCmp = BinaryOperator::CreateXor(RangeCmp, - ConstantInt::get(RangeCmp->getType(), 1), "inverted.cmp", - RangeCheckBranch); + Value *InvertedTableCmp = BinaryOperator::CreateXor( + RangeCmp, ConstantInt::get(RangeCmp->getType(), 1), "inverted.cmp", + RangeCheckBranch); CmpInst->replaceAllUsesWith(InvertedTableCmp); ++NumTableCmpReuses; } @@ -4629,7 +4823,8 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // GEP needs a runtime relocation in PIC code. We should just build one big // string and lookup indices into that. - // Ignore switches with less than three cases. Lookup tables will not make them + // Ignore switches with less than three cases. Lookup tables will not make + // them // faster, so we don't analyze them. if (SI->getNumCases() < 3) return false; @@ -4642,11 +4837,11 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, ConstantInt *MaxCaseVal = CI.getCaseValue(); BasicBlock *CommonDest = nullptr; - typedef SmallVector<std::pair<ConstantInt*, Constant*>, 4> ResultListTy; - SmallDenseMap<PHINode*, ResultListTy> ResultLists; - SmallDenseMap<PHINode*, Constant*> DefaultResults; - SmallDenseMap<PHINode*, Type*> ResultTypes; - SmallVector<PHINode*, 4> PHIs; + typedef SmallVector<std::pair<ConstantInt *, Constant *>, 4> ResultListTy; + SmallDenseMap<PHINode *, ResultListTy> ResultLists; + SmallDenseMap<PHINode *, Constant *> DefaultResults; + SmallDenseMap<PHINode *, Type *> ResultTypes; + SmallVector<PHINode *, 4> PHIs; for (SwitchInst::CaseIt E = SI->case_end(); CI != E; ++CI) { ConstantInt *CaseVal = CI.getCaseValue(); @@ -4656,7 +4851,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, MaxCaseVal = CaseVal; // Resulting value at phi nodes for this case value. - typedef SmallVector<std::pair<PHINode*, Constant*>, 4> ResultsTy; + typedef SmallVector<std::pair<PHINode *, Constant *>, 4> ResultsTy; ResultsTy Results; if (!GetCaseResults(SI, CaseVal, CI.getCaseSuccessor(), &CommonDest, Results, DL)) @@ -4684,14 +4879,14 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // If the table has holes, we need a constant result for the default case // or a bitmask that fits in a register. 
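reuseTableCompare, reformatted above, is clearest with concrete values: when every table entry compares the same way against the user's constant, the compare after the lookup collapses to the range check that already guards it. A sketch under that assumption:

    static const int Table[3] = {4, 5, 6}; // no entry equals DefaultVal

    int lookup(int x) {
      const int MinCase = 10, DefaultVal = 0;
      unsigned Idx = (unsigned)(x - MinCase);
      bool InRange = Idx < 3;                // the existing range check
      int R = InRange ? Table[Idx] : DefaultVal;
      // User code tests (R != DefaultVal). Because Table[i] != DefaultVal
      // for every i, that compare is exactly InRange, so it is rewritten
      // to reuse it (or its XOR'd inversion for the opposite predicate).
      bool Cmp = InRange;                    // instead of: R != DefaultVal
      return Cmp ? R : -1;
    }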
- SmallVector<std::pair<PHINode*, Constant*>, 4> DefaultResultsList; + SmallVector<std::pair<PHINode *, Constant *>, 4> DefaultResultsList; bool HasDefaultResults = GetCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest, DefaultResultsList, DL); bool NeedMask = (TableHasHoles && !HasDefaultResults); if (NeedMask) { // As an extra penalty for the validity test we require more cases. - if (SI->getNumCases() < 4) // FIXME: Find best threshold value (benchmark). + if (SI->getNumCases() < 4) // FIXME: Find best threshold value (benchmark). return false; if (!DL.fitsInLegalInteger(TableSize)) return false; @@ -4708,15 +4903,13 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // Create the BB that does the lookups. Module &Mod = *CommonDest->getParent()->getParent(); - BasicBlock *LookupBB = BasicBlock::Create(Mod.getContext(), - "switch.lookup", - CommonDest->getParent(), - CommonDest); + BasicBlock *LookupBB = BasicBlock::Create( + Mod.getContext(), "switch.lookup", CommonDest->getParent(), CommonDest); // Compute the table index value. Builder.SetInsertPoint(SI); - Value *TableIndex = Builder.CreateSub(SI->getCondition(), MinCaseVal, - "switch.tableidx"); + Value *TableIndex = + Builder.CreateSub(SI->getCondition(), MinCaseVal, "switch.tableidx"); // Compute the maximum table size representable by the integer type we are // switching upon. @@ -4739,9 +4932,10 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // Note: We call removeProdecessor later since we need to be able to get the // PHI value for the default case in case we're using a bit mask. } else { - Value *Cmp = Builder.CreateICmpULT(TableIndex, ConstantInt::get( - MinCaseVal->getType(), TableSize)); - RangeCheckBranch = Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest()); + Value *Cmp = Builder.CreateICmpULT( + TableIndex, ConstantInt::get(MinCaseVal->getType(), TableSize)); + RangeCheckBranch = + Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest()); } // Populate the BB that does the lookups. @@ -4753,10 +4947,8 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // and we create a new LookupBB. BasicBlock *MaskBB = LookupBB; MaskBB->setName("switch.hole_check"); - LookupBB = BasicBlock::Create(Mod.getContext(), - "switch.lookup", - CommonDest->getParent(), - CommonDest); + LookupBB = BasicBlock::Create(Mod.getContext(), "switch.lookup", + CommonDest->getParent(), CommonDest); // Make the mask's bitwidth at least 8bit and a power-of-2 to avoid // unnecessary illegal types. @@ -4766,8 +4958,8 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // Build bitmask; fill in a 1 bit for every case. const ResultListTy &ResultList = ResultLists[PHIs[0]]; for (size_t I = 0, E = ResultList.size(); I != E; ++I) { - uint64_t Idx = (ResultList[I].first->getValue() - - MinCaseVal->getValue()).getLimitedValue(); + uint64_t Idx = (ResultList[I].first->getValue() - MinCaseVal->getValue()) + .getLimitedValue(); MaskInt |= One << Idx; } ConstantInt *TableMask = ConstantInt::get(Mod.getContext(), MaskInt); @@ -4776,13 +4968,11 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // If this bit is 0 (meaning hole) jump to the default destination, // else continue with table lookup. 
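When the table has holes and no usable default result, the surrounding hunk builds a bitmap of valid slots and branches on one bit of it before the lookup. A minimal model of that mask, assuming at most 64 slots:

    #include <cstdint>

    // One bit per table slot, set when the slot holds a real case result.
    static uint64_t buildHoleMask(const uint64_t *CaseVals, unsigned NumCases,
                                  uint64_t MinCaseVal) {
      uint64_t Mask = 0;
      for (unsigned i = 0; i != NumCases; ++i)
        Mask |= 1ULL << (CaseVals[i] - MinCaseVal);
      return Mask;
    }

    // The "switch.lobit" test: a cleared bit is a hole, so take the default.
    static bool hasEntry(uint64_t Mask, uint64_t Index) {
      return (Mask >> Index) & 1;
    }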
IntegerType *MapTy = TableMask->getType(); - Value *MaskIndex = Builder.CreateZExtOrTrunc(TableIndex, MapTy, - "switch.maskindex"); - Value *Shifted = Builder.CreateLShr(TableMask, MaskIndex, - "switch.shifted"); - Value *LoBit = Builder.CreateTrunc(Shifted, - Type::getInt1Ty(Mod.getContext()), - "switch.lobit"); + Value *MaskIndex = + Builder.CreateZExtOrTrunc(TableIndex, MapTy, "switch.maskindex"); + Value *Shifted = Builder.CreateLShr(TableMask, MaskIndex, "switch.shifted"); + Value *LoBit = Builder.CreateTrunc( + Shifted, Type::getInt1Ty(Mod.getContext()), "switch.lobit"); Builder.CreateCondBr(LoBit, LookupBB, SI->getDefaultDest()); Builder.SetInsertPoint(LookupBB); @@ -4905,7 +5095,8 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) { if (!Dest->hasAddressTaken() || !Succs.insert(Dest).second) { Dest->removePredecessor(BB); IBI->removeDestination(i); - --i; --e; + --i; + --e; Changed = true; } } @@ -4968,27 +5159,28 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI, LandingPadInst *LPad2 = dyn_cast<LandingPadInst>(I); if (!LPad2 || !LPad2->isIdenticalTo(LPad)) continue; - for (++I; isa<DbgInfoIntrinsic>(I); ++I) {} + for (++I; isa<DbgInfoIntrinsic>(I); ++I) { + } BranchInst *BI2 = dyn_cast<BranchInst>(I); if (!BI2 || !BI2->isIdenticalTo(BI)) continue; - // We've found an identical block. Update our predeccessors to take that + // We've found an identical block. Update our predecessors to take that // path instead and make ourselves dead. SmallSet<BasicBlock *, 16> Preds; Preds.insert(pred_begin(BB), pred_end(BB)); for (BasicBlock *Pred : Preds) { InvokeInst *II = cast<InvokeInst>(Pred->getTerminator()); - assert(II->getNormalDest() != BB && - II->getUnwindDest() == BB && "unexpected successor"); + assert(II->getNormalDest() != BB && II->getUnwindDest() == BB && + "unexpected successor"); II->setUnwindDest(OtherPred); } // The debug info in OtherPred doesn't cover the merged control flow that // used to go through BB. We need to delete it or update it. - for (auto I = OtherPred->begin(), E = OtherPred->end(); - I != E;) { - Instruction &Inst = *I; I++; + for (auto I = OtherPred->begin(), E = OtherPred->end(); I != E;) { + Instruction &Inst = *I; + I++; if (isa<DbgInfoIntrinsic>(Inst)) Inst.eraseFromParent(); } @@ -5007,15 +5199,22 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI, return false; } -bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){ +bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, + IRBuilder<> &Builder) { BasicBlock *BB = BI->getParent(); if (SinkCommon && SinkThenElseCodeToEnd(BI)) return true; // If the Terminator is the only non-phi instruction, simplify the block. + // If LoopHeaders is provided, check whether the block is a loop header. + // (This is for early invocations, before loop simplify and vectorization, + // to keep canonical loop forms for nested loops. These blocks can be + // eliminated when the pass is invoked later in the back end.) BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator(); if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() && + (!LoopHeaders || !LoopHeaders->count(BB)) && TryToSimplifyUncondBranchFromEmptyBlock(BB)) return true; @@ -5034,9 +5233,9 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){ // See if we can merge an empty landing pad block with another which is // equivalent.
if (LandingPadInst *LPad = dyn_cast<LandingPadInst>(I)) { - for (++I; isa<DbgInfoIntrinsic>(I); ++I) {} - if (I->isTerminator() && - TryToMergeLandingPad(LPad, BI, BB)) + for (++I; isa<DbgInfoIntrinsic>(I); ++I) { + } + if (I->isTerminator() && TryToMergeLandingPad(LPad, BI, BB)) return true; } @@ -5081,7 +5280,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { if (&*I == BI) { if (FoldValueComparisonIntoPredecessors(BI, Builder)) return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; - } else if (&*I == cast<Instruction>(BI->getCondition())){ + } else if (&*I == cast<Instruction>(BI->getCondition())) { ++I; // Ignore dbg intrinsics. while (isa<DbgInfoIntrinsic>(I)) @@ -5095,6 +5294,30 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { if (SimplifyBranchOnICmpChain(BI, Builder, DL)) return true; + // If this basic block has a single dominating predecessor block and the + // dominating block's condition implies BI's condition, we know the direction + // of the BI branch. + if (BasicBlock *Dom = BB->getSinglePredecessor()) { + auto *PBI = dyn_cast_or_null<BranchInst>(Dom->getTerminator()); + if (PBI && PBI->isConditional() && + PBI->getSuccessor(0) != PBI->getSuccessor(1) && + (PBI->getSuccessor(0) == BB || PBI->getSuccessor(1) == BB)) { + bool CondIsFalse = PBI->getSuccessor(1) == BB; + Optional<bool> Implication = isImpliedCondition( + PBI->getCondition(), BI->getCondition(), DL, CondIsFalse); + if (Implication) { + // Turn this into a branch on constant. + auto *OldCond = BI->getCondition(); + ConstantInt *CI = *Implication + ? ConstantInt::getTrue(BB->getContext()) + : ConstantInt::getFalse(BB->getContext()); + BI->setCondition(CI); + RecursivelyDeleteTriviallyDeadInstructions(OldCond); + return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; + } + } + } + // If this basic block is ONLY a compare and a branch, and if a predecessor // branches to us and one of our successors, fold the comparison into the // predecessor and use logical operations to pick the right destination. @@ -5149,7 +5372,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { if (PBI != BI && PBI->isConditional()) if (mergeConditionalStores(PBI, BI)) return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true; - + return false; } @@ -5162,7 +5385,7 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) { if (I->use_empty()) return false; - if (C->isNullValue()) { + if (C->isNullValue() || isa<UndefValue>(C)) { // Only look at the first use, avoid hurting compile time with long uselists User *Use = *I->user_begin(); @@ -5189,7 +5412,12 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) { // Store to null is undefined. if (StoreInst *SI = dyn_cast<StoreInst>(Use)) if (!SI->isVolatile()) - return SI->getPointerAddressSpace() == 0 && SI->getPointerOperand() == I; + return SI->getPointerAddressSpace() == 0 && + SI->getPointerOperand() == I; + + // A call to null is undefined. + if (auto CS = CallSite(Use)) + return CS.getCalledValue() == I; } return false; } @@ -5210,8 +5438,8 @@ static bool removeUndefIntroducingPredecessor(BasicBlock *BB) { if (BI->isUnconditional()) Builder.CreateUnreachable(); else - Builder.CreateBr(BI->getSuccessor(0) == BB ? BI->getSuccessor(1) : - BI->getSuccessor(0)); + Builder.CreateBr(BI->getSuccessor(0) == BB ? 
BI->getSuccessor(1) + : BI->getSuccessor(0)); BI->eraseFromParent(); return true; } @@ -5229,8 +5457,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { // Remove basic blocks that have no predecessors (except the entry block)... // or that just have themself as a predecessor. These are unreachable. - if ((pred_empty(BB) && - BB != &BB->getParent()->getEntryBlock()) || + if ((pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) || BB->getSinglePredecessor() == BB) { DEBUG(dbgs() << "Removing BB: \n" << *BB); DeleteDeadBlock(BB); @@ -5265,25 +5492,33 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { Builder.SetInsertPoint(BB->getTerminator()); if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { if (BI->isUnconditional()) { - if (SimplifyUncondBranch(BI, Builder)) return true; + if (SimplifyUncondBranch(BI, Builder)) + return true; } else { - if (SimplifyCondBranch(BI, Builder)) return true; + if (SimplifyCondBranch(BI, Builder)) + return true; } } else if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) { - if (SimplifyReturn(RI, Builder)) return true; + if (SimplifyReturn(RI, Builder)) + return true; } else if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator())) { - if (SimplifyResume(RI, Builder)) return true; + if (SimplifyResume(RI, Builder)) + return true; } else if (CleanupReturnInst *RI = - dyn_cast<CleanupReturnInst>(BB->getTerminator())) { - if (SimplifyCleanupReturn(RI)) return true; + dyn_cast<CleanupReturnInst>(BB->getTerminator())) { + if (SimplifyCleanupReturn(RI)) + return true; } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) { - if (SimplifySwitch(SI, Builder)) return true; + if (SimplifySwitch(SI, Builder)) + return true; } else if (UnreachableInst *UI = - dyn_cast<UnreachableInst>(BB->getTerminator())) { - if (SimplifyUnreachable(UI)) return true; + dyn_cast<UnreachableInst>(BB->getTerminator())) { + if (SimplifyUnreachable(UI)) + return true; } else if (IndirectBrInst *IBI = - dyn_cast<IndirectBrInst>(BB->getTerminator())) { - if (SimplifyIndirectBr(IBI)) return true; + dyn_cast<IndirectBrInst>(BB->getTerminator())) { + if (SimplifyIndirectBr(IBI)) + return true; } return Changed; @@ -5295,7 +5530,9 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { /// of the CFG. It returns true if a modification was made. 
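One addition above deserves a concrete picture: the new isImpliedCondition check in SimplifyCondBranch folds a conditional branch whose direction is fixed by a dominating branch. In C terms (sketch):

    void took_true();
    void took_false();

    void f(int x) {
      if (x > 10) {
        // The only path here is through (x > 10), which implies (x > 5),
        // so the inner branch becomes a branch on constant true and the
        // dead arm falls away.
        if (x > 5)
          took_true();
        else
          took_false(); // provably unreachable
      }
    }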
/// bool llvm::SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI, - unsigned BonusInstThreshold, AssumptionCache *AC) { + unsigned BonusInstThreshold, AssumptionCache *AC, + SmallPtrSetImpl<BasicBlock *> *LoopHeaders) { return SimplifyCFGOpt(TTI, BB->getModule()->getDataLayout(), - BonusInstThreshold, AC).run(BB); + BonusInstThreshold, AC, LoopHeaders) + .run(BB); } diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp index ddd8775a8431d..6b1d3dc413305 100644 --- a/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -25,7 +25,6 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -71,14 +70,12 @@ namespace { bool eliminateIdentitySCEV(Instruction *UseInst, Instruction *IVOperand); + bool eliminateOverflowIntrinsic(CallInst *CI); bool eliminateIVUser(Instruction *UseInst, Instruction *IVOperand); void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand); void eliminateIVRemainder(BinaryOperator *Rem, Value *IVOperand, bool IsSigned); bool strengthenOverflowingOperation(BinaryOperator *OBO, Value *IVOperand); - - Instruction *splitOverflowIntrinsic(Instruction *IVUser, - const DominatorTree *DT); }; } @@ -183,9 +180,8 @@ void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) { DeadInsts.emplace_back(ICmp); DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); } else if (isa<PHINode>(IVOperand) && - SE->isLoopInvariantPredicate(Pred, S, X, ICmpLoop, - InvariantPredicate, InvariantLHS, - InvariantRHS)) { + SE->isLoopInvariantPredicate(Pred, S, X, L, InvariantPredicate, + InvariantLHS, InvariantRHS)) { // Rewrite the comparison to a loop invariant comparison if it can be done // cheaply, where cheaply means "we don't need to emit any new @@ -201,9 +197,48 @@ void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) { NewRHS = ICmp->getOperand(S == InvariantRHS ? IVOperIdx : (1 - IVOperIdx)); - for (Value *Incoming : cast<PHINode>(IVOperand)->incoming_values()) { - if (NewLHS && NewRHS) - break; + auto *PN = cast<PHINode>(IVOperand); + for (unsigned i = 0, e = PN->getNumIncomingValues(); + i != e && (!NewLHS || !NewRHS); + ++i) { + + // If this is a value incoming from the backedge, then it cannot be a loop + // invariant value (since we know that IVOperand is an induction variable). + if (L->contains(PN->getIncomingBlock(i))) + continue; + + // NB! This following assert does not fundamentally have to be true, but + // it is true today given how SCEV analyzes induction variables. + // Specifically, today SCEV will *not* recognize %iv as an induction + // variable in the following case: + // + // define void @f(i32 %k) { + // entry: + // br i1 undef, label %r, label %l + // + // l: + // %k.inc.l = add i32 %k, 1 + // br label %loop + // + // r: + // %k.inc.r = add i32 %k, 1 + // br label %loop + // + // loop: + // %iv = phi i32 [ %k.inc.l, %l ], [ %k.inc.r, %r ], [ %iv.inc, %loop ] + // %iv.inc = add i32 %iv, 1 + // br label %loop + // } + // + // but if it starts to, at some point, then the assertion below will have + // to be changed to a runtime check. 
+ + Value *Incoming = PN->getIncomingValue(i); + +#ifndef NDEBUG + if (auto *I = dyn_cast<Instruction>(Incoming)) + assert(DT->dominates(I, ICmp) && "Should be a unique loop dominating value!"); +#endif const SCEV *IncomingS = SE->getSCEV(Incoming); @@ -280,6 +315,108 @@ void SimplifyIndvar::eliminateIVRemainder(BinaryOperator *Rem, DeadInsts.emplace_back(Rem); } +bool SimplifyIndvar::eliminateOverflowIntrinsic(CallInst *CI) { + auto *F = CI->getCalledFunction(); + if (!F) + return false; + + typedef const SCEV *(ScalarEvolution::*OperationFunctionTy)( + const SCEV *, const SCEV *, SCEV::NoWrapFlags); + typedef const SCEV *(ScalarEvolution::*ExtensionFunctionTy)( + const SCEV *, Type *); + + OperationFunctionTy Operation; + ExtensionFunctionTy Extension; + + Instruction::BinaryOps RawOp; + + // We always have exactly one of nsw or nuw. If NoSignedOverflow is false, we + // have nuw. + bool NoSignedOverflow; + + switch (F->getIntrinsicID()) { + default: + return false; + + case Intrinsic::sadd_with_overflow: + Operation = &ScalarEvolution::getAddExpr; + Extension = &ScalarEvolution::getSignExtendExpr; + RawOp = Instruction::Add; + NoSignedOverflow = true; + break; + + case Intrinsic::uadd_with_overflow: + Operation = &ScalarEvolution::getAddExpr; + Extension = &ScalarEvolution::getZeroExtendExpr; + RawOp = Instruction::Add; + NoSignedOverflow = false; + break; + + case Intrinsic::ssub_with_overflow: + Operation = &ScalarEvolution::getMinusSCEV; + Extension = &ScalarEvolution::getSignExtendExpr; + RawOp = Instruction::Sub; + NoSignedOverflow = true; + break; + + case Intrinsic::usub_with_overflow: + Operation = &ScalarEvolution::getMinusSCEV; + Extension = &ScalarEvolution::getZeroExtendExpr; + RawOp = Instruction::Sub; + NoSignedOverflow = false; + break; + } + + const SCEV *LHS = SE->getSCEV(CI->getArgOperand(0)); + const SCEV *RHS = SE->getSCEV(CI->getArgOperand(1)); + + auto *NarrowTy = cast<IntegerType>(LHS->getType()); + auto *WideTy = + IntegerType::get(NarrowTy->getContext(), NarrowTy->getBitWidth() * 2); + + const SCEV *A = + (SE->*Extension)((SE->*Operation)(LHS, RHS, SCEV::FlagAnyWrap), WideTy); + const SCEV *B = + (SE->*Operation)((SE->*Extension)(LHS, WideTy), + (SE->*Extension)(RHS, WideTy), SCEV::FlagAnyWrap); + + if (A != B) + return false; + + // Proved no overflow, nuke the overflow check and, if possible, the overflow + // intrinsic as well. + + BinaryOperator *NewResult = BinaryOperator::Create( + RawOp, CI->getArgOperand(0), CI->getArgOperand(1), "", CI); + + if (NoSignedOverflow) + NewResult->setHasNoSignedWrap(true); + else + NewResult->setHasNoUnsignedWrap(true); + + SmallVector<ExtractValueInst *, 4> ToDelete; + + for (auto *U : CI->users()) { + if (auto *EVI = dyn_cast<ExtractValueInst>(U)) { + if (EVI->getIndices()[0] == 1) + EVI->replaceAllUsesWith(ConstantInt::getFalse(CI->getContext())); + else { + assert(EVI->getIndices()[0] == 0 && "Only two possibilities!"); + EVI->replaceAllUsesWith(NewResult); + } + ToDelete.push_back(EVI); + } + } + + for (auto *EVI : ToDelete) + EVI->eraseFromParent(); + + if (CI->use_empty()) + CI->eraseFromParent(); + + return true; +} + /// Eliminate an operation that consumes a simple IV and has no observable /// side-effect given the range of IV values. IVOperand is guaranteed SCEVable, /// but UseInst may not be. 
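The proof strategy in eliminateOverflowIntrinsic above bears restating: evaluate the operation in a type twice as wide, and compare that against the narrow result re-extended; the two agree exactly when the narrow operation cannot wrap. SCEV performs this comparison symbolically, so it holds for every value the operands take. A concrete 32-bit model with plain integers standing in for SCEVs:

    #include <cstdint>

    // Returns true iff a + b wraps in 32-bit signed arithmetic, mirroring
    // the A != B test above for Intrinsic::sadd_with_overflow.
    static bool saddOverflows(int32_t a, int32_t b) {
      int64_t Wide = (int64_t)a + (int64_t)b;                // op(ext(a), ext(b))
      int32_t Narrow = (int32_t)((uint32_t)a + (uint32_t)b); // wrapping narrow add
      return Wide != (int64_t)Narrow;                        // vs. ext(op(a, b))
    }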
@@ -297,6 +434,10 @@ bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst, } } + if (auto *CI = dyn_cast<CallInst>(UseInst)) + if (eliminateOverflowIntrinsic(CI)) + return true; + if (eliminateIdentitySCEV(UseInst, IVOperand)) return true; @@ -408,69 +549,6 @@ bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO, return Changed; } -/// \brief Split sadd.with.overflow into add + sadd.with.overflow to allow -/// analysis and optimization. -/// -/// \return A new value representing the non-overflowing add if possible, -/// otherwise return the original value. -Instruction *SimplifyIndvar::splitOverflowIntrinsic(Instruction *IVUser, - const DominatorTree *DT) { - IntrinsicInst *II = dyn_cast<IntrinsicInst>(IVUser); - if (!II || II->getIntrinsicID() != Intrinsic::sadd_with_overflow) - return IVUser; - - // Find a branch guarded by the overflow check. - BranchInst *Branch = nullptr; - Instruction *AddVal = nullptr; - for (User *U : II->users()) { - if (ExtractValueInst *ExtractInst = dyn_cast<ExtractValueInst>(U)) { - if (ExtractInst->getNumIndices() != 1) - continue; - if (ExtractInst->getIndices()[0] == 0) - AddVal = ExtractInst; - else if (ExtractInst->getIndices()[0] == 1 && ExtractInst->hasOneUse()) - Branch = dyn_cast<BranchInst>(ExtractInst->user_back()); - } - } - if (!AddVal || !Branch) - return IVUser; - - BasicBlock *ContinueBB = Branch->getSuccessor(1); - if (std::next(pred_begin(ContinueBB)) != pred_end(ContinueBB)) - return IVUser; - - // Check if all users of the add are provably NSW. - bool AllNSW = true; - for (Use &U : AddVal->uses()) { - if (Instruction *UseInst = dyn_cast<Instruction>(U.getUser())) { - BasicBlock *UseBB = UseInst->getParent(); - if (PHINode *PHI = dyn_cast<PHINode>(UseInst)) - UseBB = PHI->getIncomingBlock(U); - if (!DT->dominates(ContinueBB, UseBB)) { - AllNSW = false; - break; - } - } - } - if (!AllNSW) - return IVUser; - - // Go for it... - IRBuilder<> Builder(IVUser); - Instruction *AddInst = dyn_cast<Instruction>( - Builder.CreateNSWAdd(II->getOperand(0), II->getOperand(1))); - - // The caller expects the new add to have the same form as the intrinsic. The - // IV operand position must be the same. - assert((AddInst->getOpcode() == Instruction::Add && - AddInst->getOperand(0) == II->getOperand(0)) && - "Bad add instruction created from overflow intrinsic."); - - AddVal->replaceAllUsesWith(AddInst); - DeadInsts.emplace_back(AddVal); - return AddInst; -} - /// Add all uses of Def to the current IV's worklist. static void pushIVUsers( Instruction *Def, @@ -545,12 +623,6 @@ void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) { // Bypass back edges to avoid extra work. 
if (UseInst == CurrIV) continue; - if (V && V->shouldSplitOverflowInstrinsics()) { - UseInst = splitOverflowIntrinsic(UseInst, V->getDomTree()); - if (!UseInst) - continue; - } - Instruction *IVOperand = UseOper.second; for (unsigned N = 0; IVOperand; ++N) { assert(N <= Simplified.size() && "runaway iteration"); diff --git a/lib/Transforms/Utils/SimplifyInstructions.cpp b/lib/Transforms/Utils/SimplifyInstructions.cpp index d5377f9a4c1ff..df299067094f6 100644 --- a/lib/Transforms/Utils/SimplifyInstructions.cpp +++ b/lib/Transforms/Utils/SimplifyInstructions.cpp @@ -14,7 +14,7 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/SimplifyInstructions.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" @@ -27,12 +27,60 @@ #include "llvm/IR/Type.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Scalar.h" using namespace llvm; #define DEBUG_TYPE "instsimplify" STATISTIC(NumSimplified, "Number of redundant instructions removed"); +static bool runImpl(Function &F, const DominatorTree *DT, const TargetLibraryInfo *TLI, + AssumptionCache *AC) { + const DataLayout &DL = F.getParent()->getDataLayout(); + SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2; + bool Changed = false; + + do { + for (BasicBlock *BB : depth_first(&F.getEntryBlock())) + // Here be subtlety: the iterator must be incremented before the loop + // body (not sure why), so a range-for loop won't work here. + for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { + Instruction *I = &*BI++; + // The first time through the loop ToSimplify is empty and we try to + // simplify all instructions. On later iterations ToSimplify is not + // empty and we only bother simplifying instructions that are in it. + if (!ToSimplify->empty() && !ToSimplify->count(I)) + continue; + // Don't waste time simplifying unused instructions. + if (!I->use_empty()) + if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) { + // Mark all uses for resimplification next time round the loop. + for (User *U : I->users()) + Next->insert(cast<Instruction>(U)); + I->replaceAllUsesWith(V); + ++NumSimplified; + Changed = true; + } + bool res = RecursivelyDeleteTriviallyDeadInstructions(I, TLI); + if (res) { + // RecursivelyDeleteTriviallyDeadInstruction can remove + // more than one instruction, so simply incrementing the + // iterator does not work. When instructions get deleted + // re-iterate instead. + BI = BB->begin(); BE = BB->end(); + Changed |= res; + } + } + + // Place the list of instructions to simplify on the next loop iteration + // into ToSimplify. + std::swap(ToSimplify, Next); + Next->clear(); + } while (!ToSimplify->empty()); + + return Changed; +} + namespace { struct InstSimplifier : public FunctionPass { static char ID; // Pass identification, replacement for typeid @@ -48,56 +96,17 @@ namespace { /// runOnFunction - Remove instructions that simplify. bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + const DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); const DominatorTree *DT = DTWP ? 
&DTWP->getDomTree() : nullptr; - const DataLayout &DL = F.getParent()->getDataLayout(); const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); AssumptionCache *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2; - bool Changed = false; - - do { - for (BasicBlock *BB : depth_first(&F.getEntryBlock())) - // Here be subtlety: the iterator must be incremented before the loop - // body (not sure why), so a range-for loop won't work here. - for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { - Instruction *I = &*BI++; - // The first time through the loop ToSimplify is empty and we try to - // simplify all instructions. On later iterations ToSimplify is not - // empty and we only bother simplifying instructions that are in it. - if (!ToSimplify->empty() && !ToSimplify->count(I)) - continue; - // Don't waste time simplifying unused instructions. - if (!I->use_empty()) - if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) { - // Mark all uses for resimplification next time round the loop. - for (User *U : I->users()) - Next->insert(cast<Instruction>(U)); - I->replaceAllUsesWith(V); - ++NumSimplified; - Changed = true; - } - bool res = RecursivelyDeleteTriviallyDeadInstructions(I, TLI); - if (res) { - // RecursivelyDeleteTriviallyDeadInstruction can remove - // more than one instruction, so simply incrementing the - // iterator does not work. When instructions get deleted - // re-iterate instead. - BI = BB->begin(); BE = BB->end(); - Changed |= res; - } - } - - // Place the list of instructions to simplify on the next loop iteration - // into ToSimplify. - std::swap(ToSimplify, Next); - Next->clear(); - } while (!ToSimplify->empty()); - - return Changed; + return runImpl(F, DT, TLI, AC); } }; } @@ -115,3 +124,15 @@ char &llvm::InstructionSimplifierID = InstSimplifier::ID; FunctionPass *llvm::createInstructionSimplifierPass() { return new InstSimplifier(); } + +PreservedAnalyses InstSimplifierPass::run(Function &F, + AnalysisManager<Function> &AM) { + auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F); + auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); + auto &AC = AM.getResult<AssumptionAnalysis>(F); + bool Changed = runImpl(F, DT, &TLI, &AC); + if (!Changed) + return PreservedAnalyses::all(); + // FIXME: This should also 'preserve the CFG'. + return PreservedAnalyses::none(); +} diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index 2f3c31128cf03..c2986951e48fd 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -29,7 +29,6 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" @@ -104,101 +103,11 @@ static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty, } } -/// \brief Check whether we can use unsafe floating point math for -/// the function passed as input. -static bool canUseUnsafeFPMath(Function *F) { - - // FIXME: For finer-grain optimization, we need intrinsics to have the same - // fast-math flag decorations that are applied to FP instructions. 
For now, - // we have to rely on the function-level unsafe-fp-math attribute to do this - // optimization because there's no other way to express that the call can be - // relaxed. - if (F->hasFnAttribute("unsafe-fp-math")) { - Attribute Attr = F->getFnAttribute("unsafe-fp-math"); - if (Attr.getValueAsString() == "true") - return true; - } - return false; -} - -/// \brief Returns whether \p F matches the signature expected for the -/// string/memory copying library function \p Func. -/// Acceptable functions are st[rp][n]?cpy, memove, memcpy, and memset. -/// Their fortified (_chk) counterparts are also accepted. -static bool checkStringCopyLibFuncSignature(Function *F, LibFunc::Func Func) { - const DataLayout &DL = F->getParent()->getDataLayout(); - FunctionType *FT = F->getFunctionType(); - LLVMContext &Context = F->getContext(); - Type *PCharTy = Type::getInt8PtrTy(Context); - Type *SizeTTy = DL.getIntPtrType(Context); - unsigned NumParams = FT->getNumParams(); - - // All string libfuncs return the same type as the first parameter. - if (FT->getReturnType() != FT->getParamType(0)) - return false; - - switch (Func) { - default: - llvm_unreachable("Can't check signature for non-string-copy libfunc."); - case LibFunc::stpncpy_chk: - case LibFunc::strncpy_chk: - --NumParams; // fallthrough - case LibFunc::stpncpy: - case LibFunc::strncpy: { - if (NumParams != 3 || FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != PCharTy || !FT->getParamType(2)->isIntegerTy()) - return false; - break; - } - case LibFunc::strcpy_chk: - case LibFunc::stpcpy_chk: - --NumParams; // fallthrough - case LibFunc::stpcpy: - case LibFunc::strcpy: { - if (NumParams != 2 || FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != PCharTy) - return false; - break; - } - case LibFunc::memmove_chk: - case LibFunc::memcpy_chk: - --NumParams; // fallthrough - case LibFunc::memmove: - case LibFunc::memcpy: { - if (NumParams != 3 || !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || FT->getParamType(2) != SizeTTy) - return false; - break; - } - case LibFunc::memset_chk: - --NumParams; // fallthrough - case LibFunc::memset: { - if (NumParams != 3 || !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isIntegerTy() || FT->getParamType(2) != SizeTTy) - return false; - break; - } - } - // If this is a fortified libcall, the last parameter is a size_t. - if (NumParams == FT->getNumParams() - 1) - return FT->getParamType(FT->getNumParams() - 1) == SizeTTy; - return true; -} - //===----------------------------------------------------------------------===// // String and Memory Library Call Optimizations //===----------------------------------------------------------------------===// Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - // Verify the "strcat" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2|| - FT->getReturnType() != B.getInt8PtrTy() || - FT->getParamType(0) != FT->getReturnType() || - FT->getParamType(1) != FT->getReturnType()) - return nullptr; - // Extract some information from the instruction Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); @@ -220,7 +129,7 @@ Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, IRBuilder<> &B) { // We need to find the end of the destination string. That's where the // memory is to be moved to. We just generate a call to strlen. 
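The helper being diffed here implements the classic strcat lowering: one strlen call finds the end of the destination, and a single memcpy of the known source length (plus the terminator) replaces the byte-by-byte append. A hedged source-level sketch, assuming the source length Len is a compile-time constant:

#include <cstring>

char *strcatKnownLen(char *Dst, const char *Src, size_t Len) {
  char *End = Dst + std::strlen(Dst); // where the appended bytes go
  std::memcpy(End, Src, Len + 1);     // +1 also copies the nul terminator
  return Dst;                         // strcat returns its first argument
}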
- Value *DstLen = EmitStrLen(Dst, B, DL, TLI); + Value *DstLen = emitStrLen(Dst, B, DL, TLI); if (!DstLen) return nullptr; @@ -238,15 +147,6 @@ Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, } Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - // Verify the "strncat" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || FT->getReturnType() != B.getInt8PtrTy() || - FT->getParamType(0) != FT->getReturnType() || - FT->getParamType(1) != FT->getReturnType() || - !FT->getParamType(2)->isIntegerTy()) - return nullptr; - // Extract some information from the instruction. Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); @@ -281,13 +181,7 @@ Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - // Verify the "strchr" function prototype. FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || FT->getReturnType() != B.getInt8PtrTy() || - FT->getParamType(0) != FT->getReturnType() || - !FT->getParamType(1)->isIntegerTy(32)) - return nullptr; - Value *SrcStr = CI->getArgOperand(0); // If the second operand is non-constant, see if we can compute the length @@ -298,7 +192,7 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) { if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32)) // memchr needs i32. return nullptr; - return EmitMemChr(SrcStr, CI->getArgOperand(1), // include nul. + return emitMemChr(SrcStr, CI->getArgOperand(1), // include nul. ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len), B, DL, TLI); } @@ -308,7 +202,7 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) { StringRef Str; if (!getConstantStringInfo(SrcStr, Str)) { if (CharC->isZero()) // strchr(p, 0) -> p + strlen(p) - return B.CreateGEP(B.getInt8Ty(), SrcStr, EmitStrLen(SrcStr, B, DL, TLI), + return B.CreateGEP(B.getInt8Ty(), SrcStr, emitStrLen(SrcStr, B, DL, TLI), "strchr"); return nullptr; } @@ -326,14 +220,6 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - // Verify the "strrchr" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || FT->getReturnType() != B.getInt8PtrTy() || - FT->getParamType(0) != FT->getReturnType() || - !FT->getParamType(1)->isIntegerTy(32)) - return nullptr; - Value *SrcStr = CI->getArgOperand(0); ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); @@ -345,7 +231,7 @@ Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilder<> &B) { if (!getConstantStringInfo(SrcStr, Str)) { // strrchr(s, 0) -> strchr(s, 0) if (CharC->isZero()) - return EmitStrChr(SrcStr, '\0', B, TLI); + return emitStrChr(SrcStr, '\0', B, TLI); return nullptr; } @@ -361,14 +247,6 @@ Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - // Verify the "strcmp" function prototype. 
- FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || !FT->getReturnType()->isIntegerTy(32) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != B.getInt8PtrTy()) - return nullptr; - Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); if (Str1P == Str2P) // strcmp(x,x) -> 0 return ConstantInt::get(CI->getType(), 0); @@ -392,7 +270,7 @@ Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilder<> &B) { uint64_t Len1 = GetStringLength(Str1P); uint64_t Len2 = GetStringLength(Str2P); if (Len1 && Len2) { - return EmitMemCmp(Str1P, Str2P, + return emitMemCmp(Str1P, Str2P, ConstantInt::get(DL.getIntPtrType(CI->getContext()), std::min(Len1, Len2)), B, DL, TLI); @@ -402,15 +280,6 @@ Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - // Verify the "strncmp" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || !FT->getReturnType()->isIntegerTy(32) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != B.getInt8PtrTy() || - !FT->getParamType(2)->isIntegerTy()) - return nullptr; - Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); if (Str1P == Str2P) // strncmp(x,x,n) -> 0 return ConstantInt::get(CI->getType(), 0); @@ -426,7 +295,7 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) { return ConstantInt::get(CI->getType(), 0); if (Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1) - return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, DL, TLI); + return emitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, DL, TLI); StringRef Str1, Str2; bool HasStr1 = getConstantStringInfo(Str1P, Str1); @@ -450,11 +319,6 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - - if (!checkStringCopyLibFuncSignature(Callee, LibFunc::strcpy)) - return nullptr; - Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) // strcpy(x,x) -> x return Src; @@ -473,12 +337,9 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - if (!checkStringCopyLibFuncSignature(Callee, LibFunc::stpcpy)) - return nullptr; - Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x) - Value *StrLen = EmitStrLen(Src, B, DL, TLI); + Value *StrLen = emitStrLen(Src, B, DL, TLI); return StrLen ? 
B.CreateInBoundsGEP(B.getInt8Ty(), Dst, StrLen) : nullptr; } @@ -500,9 +361,6 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - if (!checkStringCopyLibFuncSignature(Callee, LibFunc::strncpy)) - return nullptr; - Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); Value *LenOp = CI->getArgOperand(2); @@ -540,18 +398,63 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 1 || FT->getParamType(0) != B.getInt8PtrTy() || - !FT->getReturnType()->isIntegerTy()) - return nullptr; - Value *Src = CI->getArgOperand(0); // Constant folding: strlen("xyz") -> 3 if (uint64_t Len = GetStringLength(Src)) return ConstantInt::get(CI->getType(), Len - 1); + // If s is a constant pointer pointing to a string literal, we can fold + // strlen(s + x) to strlen(s) - x, when x is known to be in the range + // [0, strlen(s)] or the string has a single null terminator '\0' at the end. + // We only try to simplify strlen when the pointer s points to an array + // of i8. Otherwise, we would need to scale the offset x before doing the + // subtraction. This will make the optimization more complex, and it's not + // very useful because calling strlen for a pointer of other types is + // very uncommon. + if (GEPOperator *GEP = dyn_cast<GEPOperator>(Src)) { + if (!isGEPBasedOnPointerToString(GEP)) + return nullptr; + + StringRef Str; + if (getConstantStringInfo(GEP->getOperand(0), Str, 0, false)) { + size_t NullTermIdx = Str.find('\0'); + + // If the string does not have '\0', leave it to strlen to compute + // its length. + if (NullTermIdx == StringRef::npos) + return nullptr; + + Value *Offset = GEP->getOperand(2); + unsigned BitWidth = Offset->getType()->getIntegerBitWidth(); + APInt KnownZero(BitWidth, 0); + APInt KnownOne(BitWidth, 0); + computeKnownBits(Offset, KnownZero, KnownOne, DL, 0, nullptr, CI, + nullptr); + KnownZero.flipAllBits(); + size_t ArrSize = + cast<ArrayType>(GEP->getSourceElementType())->getNumElements(); + + // KnownZero's bits are flipped, so zeros in KnownZero now represent + // bits known to be zeros in Offset, and ones in KnowZero represent + // bits unknown in Offset. Therefore, Offset is known to be in range + // [0, NullTermIdx] when the flipped KnownZero is non-negative and + // unsigned-less-than NullTermIdx. + // + // If Offset is not provably in the range [0, NullTermIdx], we can still + // optimize if we can prove that the program has undefined behavior when + // Offset is outside that range. That is the case when GEP->getOperand(0) + // is a pointer to an object whose memory extent is NullTermIdx+1. + if ((KnownZero.isNonNegative() && KnownZero.ule(NullTermIdx)) || + (GEP->isInBounds() && isa<GlobalVariable>(GEP->getOperand(0)) && + NullTermIdx == ArrSize - 1)) + return B.CreateSub(ConstantInt::get(CI->getType(), NullTermIdx), + Offset); + } + + return nullptr; + } + // strlen(x?"foo":"bars") --> x ? 
3 : 4 if (SelectInst *SI = dyn_cast<SelectInst>(Src)) { uint64_t LenTrue = GetStringLength(SI->getTrueValue()); @@ -576,13 +479,6 @@ Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || FT->getParamType(0) != B.getInt8PtrTy() || - FT->getParamType(1) != FT->getParamType(0) || - FT->getReturnType() != FT->getParamType(0)) - return nullptr; - StringRef S1, S2; bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); @@ -604,19 +500,12 @@ Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilder<> &B) { // strpbrk(s, "a") -> strchr(s, 'a') if (HasS2 && S2.size() == 1) - return EmitStrChr(CI->getArgOperand(0), S2[0], B, TLI); + return emitStrChr(CI->getArgOperand(0), S2[0], B, TLI); return nullptr; } Value *LibCallSimplifier::optimizeStrTo(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - if ((FT->getNumParams() != 2 && FT->getNumParams() != 3) || - !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy()) - return nullptr; - Value *EndPtr = CI->getArgOperand(1); if (isa<ConstantPointerNull>(EndPtr)) { // With a null EndPtr, this function won't capture the main argument. @@ -628,13 +517,6 @@ Value *LibCallSimplifier::optimizeStrTo(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeStrSpn(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || FT->getParamType(0) != B.getInt8PtrTy() || - FT->getParamType(1) != FT->getParamType(0) || - !FT->getReturnType()->isIntegerTy()) - return nullptr; - StringRef S1, S2; bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); @@ -656,13 +538,6 @@ Value *LibCallSimplifier::optimizeStrSpn(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeStrCSpn(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || FT->getParamType(0) != B.getInt8PtrTy() || - FT->getParamType(1) != FT->getParamType(0) || - !FT->getReturnType()->isIntegerTy()) - return nullptr; - StringRef S1, S2; bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); @@ -681,29 +556,22 @@ Value *LibCallSimplifier::optimizeStrCSpn(CallInst *CI, IRBuilder<> &B) { // strcspn(s, "") -> strlen(s) if (HasS2 && S2.empty()) - return EmitStrLen(CI->getArgOperand(0), B, DL, TLI); + return emitStrLen(CI->getArgOperand(0), B, DL, TLI); return nullptr; } Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - !FT->getReturnType()->isPointerTy()) - return nullptr; - // fold strstr(x, x) -> x. 
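optimizeStrStr, continued below, applies a small family of rewrites: strstr(x, x) is x itself, a strstr compared for equality against its haystack is really a prefix test, a constant haystack and needle fold to a pointer offset, and a one-character needle becomes strchr. The prefix-test rewrite in plain C++ (illustrative, not the IR-building form):

#include <cstring>

// strstr(A, B) == A holds exactly when B occurs at offset 0, i.e. A starts
// with B; strncmp stops at the first mismatch instead of scanning all of A.
bool startsWith(const char *A, const char *B) {
  return std::strncmp(A, B, std::strlen(B)) == 0;
}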
if (CI->getArgOperand(0) == CI->getArgOperand(1)) return B.CreateBitCast(CI->getArgOperand(0), CI->getType()); // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0 if (isOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) { - Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, DL, TLI); + Value *StrLen = emitStrLen(CI->getArgOperand(1), B, DL, TLI); if (!StrLen) return nullptr; - Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1), + Value *StrNCmp = emitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1), StrLen, B, DL, TLI); if (!StrNCmp) return nullptr; @@ -734,28 +602,20 @@ Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilder<> &B) { return Constant::getNullValue(CI->getType()); // strstr("abcd", "bc") -> gep((char*)"abcd", 1) - Value *Result = CastToCStr(CI->getArgOperand(0), B); + Value *Result = castToCStr(CI->getArgOperand(0), B); Result = B.CreateConstInBoundsGEP1_64(Result, Offset, "strstr"); return B.CreateBitCast(Result, CI->getType()); } // fold strstr(x, "y") -> strchr(x, 'y'). if (HasStr2 && ToFindStr.size() == 1) { - Value *StrChr = EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TLI); + Value *StrChr = emitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TLI); return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : nullptr; } return nullptr; } Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isIntegerTy(32) || - !FT->getParamType(2)->isIntegerTy() || - !FT->getReturnType()->isPointerTy()) - return nullptr; - Value *SrcStr = CI->getArgOperand(0); ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2)); @@ -834,13 +694,6 @@ Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - !FT->getReturnType()->isIntegerTy(32)) - return nullptr; - Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1); if (LHS == RHS) // memcmp(s,s,x) -> 0 @@ -857,9 +710,9 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) { // memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS if (Len == 1) { - Value *LHSV = B.CreateZExt(B.CreateLoad(CastToCStr(LHS, B), "lhsc"), + Value *LHSV = B.CreateZExt(B.CreateLoad(castToCStr(LHS, B), "lhsc"), CI->getType(), "lhsv"); - Value *RHSV = B.CreateZExt(B.CreateLoad(CastToCStr(RHS, B), "rhsc"), + Value *RHSV = B.CreateZExt(B.CreateLoad(castToCStr(RHS, B), "rhsc"), CI->getType(), "rhsv"); return B.CreateSub(LHSV, RHSV, "chardiff"); } @@ -909,11 +762,6 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - - if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memcpy)) - return nullptr; - // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1) B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), 1); @@ -921,23 +769,81 @@ Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, 
IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - - if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memmove)) - return nullptr; - // memmove(x, y, n) -> llvm.memmove(x, y, n, 1) B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), 1); return CI->getArgOperand(0); } -Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); +// TODO: Does this belong in BuildLibCalls or should all of those similar +// functions be moved here? +static Value *emitCalloc(Value *Num, Value *Size, const AttributeSet &Attrs, + IRBuilder<> &B, const TargetLibraryInfo &TLI) { + LibFunc::Func Func; + if (!TLI.getLibFunc("calloc", Func) || !TLI.has(Func)) + return nullptr; + + Module *M = B.GetInsertBlock()->getModule(); + const DataLayout &DL = M->getDataLayout(); + IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext())); + Value *Calloc = M->getOrInsertFunction("calloc", Attrs, B.getInt8PtrTy(), + PtrType, PtrType, nullptr); + CallInst *CI = B.CreateCall(Calloc, { Num, Size }, "calloc"); + + if (const auto *F = dyn_cast<Function>(Calloc->stripPointerCasts())) + CI->setCallingConv(F->getCallingConv()); + + return CI; +} - if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memset)) +/// Fold memset[_chk](malloc(n), 0, n) --> calloc(1, n). +static Value *foldMallocMemset(CallInst *Memset, IRBuilder<> &B, + const TargetLibraryInfo &TLI) { + // This has to be a memset of zeros (bzero). + auto *FillValue = dyn_cast<ConstantInt>(Memset->getArgOperand(1)); + if (!FillValue || FillValue->getZExtValue() != 0) return nullptr; + // TODO: We should handle the case where the malloc has more than one use. + // This is necessary to optimize common patterns such as when the result of + // the malloc is checked against null or when a memset intrinsic is used in + // place of a memset library call. + auto *Malloc = dyn_cast<CallInst>(Memset->getArgOperand(0)); + if (!Malloc || !Malloc->hasOneUse()) + return nullptr; + + // Is the inner call really malloc()? + Function *InnerCallee = Malloc->getCalledFunction(); + LibFunc::Func Func; + if (!TLI.getLibFunc(*InnerCallee, Func) || !TLI.has(Func) || + Func != LibFunc::malloc) + return nullptr; + + // The memset must cover the same number of bytes that are malloc'd. + if (Memset->getArgOperand(2) != Malloc->getArgOperand(0)) + return nullptr; + + // Replace the malloc with a calloc. We need the data layout to know what the + // actual size of a 'size_t' parameter is. 
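foldMallocMemset, whose body continues below, rewrites the zero-initialization idiom at the allocation site: a malloc whose only use is a full-length memset of zero becomes calloc, which can hand back pages the OS has already zeroed. Source-level before and after (a sketch; per the TODO, the pass currently insists the malloc has a single use, so no intervening null check):

#include <cstdlib>
#include <cstring>

void *zeroedBefore(size_t N) {
  void *P = std::malloc(N);
  std::memset(P, 0, N); // fill value 0, size identical to the malloc'd size
  return P;
}

void *zeroedAfter(size_t N) {
  return std::calloc(1, N); // same bytes, potentially no explicit zeroing
}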
+ B.SetInsertPoint(Malloc->getParent(), ++Malloc->getIterator()); + const DataLayout &DL = Malloc->getModule()->getDataLayout(); + IntegerType *SizeType = DL.getIntPtrType(B.GetInsertBlock()->getContext()); + Value *Calloc = emitCalloc(ConstantInt::get(SizeType, 1), + Malloc->getArgOperand(0), Malloc->getAttributes(), + B, TLI); + if (!Calloc) + return nullptr; + + Malloc->replaceAllUsesWith(Calloc); + Malloc->eraseFromParent(); + + return Calloc; +} + +Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) { + if (auto *Calloc = foldMallocMemset(CI, B, *TLI)) + return Calloc; + // memset(p, v, n) -> llvm.memset(p, v, n, 1) Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); @@ -970,34 +876,12 @@ static Value *valueHasFloatPrecision(Value *Val) { return nullptr; } -/// Any floating-point library function that we're trying to simplify will have -/// a signature of the form: fptype foo(fptype param1, fptype param2, ...). -/// CheckDoubleTy indicates that 'fptype' must be 'double'. -static bool matchesFPLibFunctionSignature(const Function *F, unsigned NumParams, - bool CheckDoubleTy) { - FunctionType *FT = F->getFunctionType(); - if (FT->getNumParams() != NumParams) - return false; - - // The return type must match what we're looking for. - Type *RetTy = FT->getReturnType(); - if (CheckDoubleTy ? !RetTy->isDoubleTy() : !RetTy->isFloatingPointTy()) - return false; - - // Each parameter must match the return type, and therefore, match every other - // parameter too. - for (const Type *ParamTy : FT->params()) - if (ParamTy != RetTy) - return false; - - return true; -} - /// Shrink double -> float for unary functions like 'floor'. static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, bool CheckRetType) { Function *Callee = CI->getCalledFunction(); - if (!matchesFPLibFunctionSignature(Callee, 1, true)) + // We know this libcall has a valid prototype, but we don't know which. + if (!CI->getType()->isDoubleTy()) return nullptr; if (CheckRetType) { @@ -1026,7 +910,7 @@ static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, V = B.CreateCall(F, V); } else { // The call is a library call rather than an intrinsic. - V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes()); + V = emitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes()); } return B.CreateFPExt(V, B.getDoubleTy()); @@ -1035,7 +919,8 @@ static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, /// Shrink double -> float for binary functions like 'fmin/fmax'. static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - if (!matchesFPLibFunctionSignature(Callee, 2, true)) + // We know this libcall has a valid prototype, but we don't know which. + if (!CI->getType()->isDoubleTy()) return nullptr; // If this is something like 'fmin((double)floatval1, (double)floatval2)', @@ -1054,7 +939,7 @@ static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) { // fmin((double)floatval1, (double)floatval2) // -> (double)fminf(floatval1, floatval2) // TODO: Handle intrinsics in the same way as in optimizeUnaryDoubleFP(). 
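The shrink performed by optimizeBinaryDoubleFP is lossless because fmin and fmax return one of their operands unchanged: if both inputs are floats that were merely widened, fminf on the originals picks the same value, and extending it back to double reproduces the wide result bit for bit. In plain C++:

#include <cmath>

double shrunkFMin(float A, float B) {
  // Equals std::fmin(double(A), double(B)) for every input, including NaNs
  // (up to NaN payload), because fmin never rounds: it returns an operand.
  return double(std::fminf(A, B));
}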
- Value *V = EmitBinaryFloatFnCall(V1, V2, Callee->getName(), B, + Value *V = emitBinaryFloatFnCall(V1, V2, Callee->getName(), B, Callee->getAttributes()); return B.CreateFPExt(V, B.getDoubleTy()); } @@ -1066,13 +951,6 @@ Value *LibCallSimplifier::optimizeCos(CallInst *CI, IRBuilder<> &B) { if (UnsafeFPShrink && Name == "cos" && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, true); - FunctionType *FT = Callee->getFunctionType(); - // Just make sure this has 1 argument of FP type, which matches the - // result type. - if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isFloatingPointTy()) - return Ret; - // cos(-x) -> cos(x) Value *Op1 = CI->getArgOperand(0); if (BinaryOperator::isFNeg(Op1)) { @@ -1114,14 +992,6 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { if (UnsafeFPShrink && Name == "pow" && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, true); - FunctionType *FT = Callee->getFunctionType(); - // Just make sure this has 2 arguments of the same FP type, which match the - // result type. - if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - !FT->getParamType(0)->isFloatingPointTy()) - return Ret; - Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1); if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) { // pow(1.0, x) -> 1.0 @@ -1131,19 +1001,16 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { if (Op1C->isExactlyValue(2.0) && hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp2, LibFunc::exp2f, LibFunc::exp2l)) - return EmitUnaryFloatFnCall(Op2, TLI->getName(LibFunc::exp2), B, + return emitUnaryFloatFnCall(Op2, TLI->getName(LibFunc::exp2), B, Callee->getAttributes()); // pow(10.0, x) -> exp10(x) if (Op1C->isExactlyValue(10.0) && hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp10, LibFunc::exp10f, LibFunc::exp10l)) - return EmitUnaryFloatFnCall(Op2, TLI->getName(LibFunc::exp10), B, + return emitUnaryFloatFnCall(Op2, TLI->getName(LibFunc::exp10), B, Callee->getAttributes()); } - // FIXME: Use instruction-level FMF. - bool UnsafeFPMath = canUseUnsafeFPMath(CI->getParent()->getParent()); - // pow(exp(x), y) -> exp(x * y) // pow(exp2(x), y) -> exp2(x * y) // We enable these only with fast-math. Besides rounding differences, the @@ -1159,7 +1026,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { IRBuilder<>::FastMathFlagGuard Guard(B); B.setFastMathFlags(CI->getFastMathFlags()); Value *FMul = B.CreateFMul(OpC->getArgOperand(0), Op2, "mul"); - return EmitUnaryFloatFnCall(FMul, OpCCallee->getName(), B, + return emitUnaryFloatFnCall(FMul, OpCCallee->getName(), B, OpCCallee->getAttributes()); } } @@ -1181,7 +1048,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { if (CI->hasUnsafeAlgebra()) { IRBuilder<>::FastMathFlagGuard Guard(B); B.setFastMathFlags(CI->getFastMathFlags()); - return EmitUnaryFloatFnCall(Op1, TLI->getName(LibFunc::sqrt), B, + return emitUnaryFloatFnCall(Op1, TLI->getName(LibFunc::sqrt), B, Callee->getAttributes()); } @@ -1191,9 +1058,9 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { // TODO: In finite-only mode, this could be just fabs(sqrt(x)). 
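The select built below exists because pow(x, 0.5) and sqrt(x) disagree on exactly two inputs, negative zero and negative infinity; fabs repairs the first and the select on -inf repairs the second. A checkable summary of the IEEE behavior this relies on:

#include <cassert>
#include <cmath>

int main() {
  assert(!std::signbit(std::pow(-0.0, 0.5)));   // pow(-0.0, 0.5) == +0.0
  assert(std::signbit(std::sqrt(-0.0)));        // but sqrt(-0.0) == -0.0
  assert(std::pow(-INFINITY, 0.5) == INFINITY); // pow(-inf, 0.5) == +inf
  assert(std::isnan(std::sqrt(-INFINITY)));     // while sqrt(-inf) is NaN
}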
Value *Inf = ConstantFP::getInfinity(CI->getType()); Value *NegInf = ConstantFP::getInfinity(CI->getType(), true); - Value *Sqrt = EmitUnaryFloatFnCall(Op1, "sqrt", B, Callee->getAttributes()); + Value *Sqrt = emitUnaryFloatFnCall(Op1, "sqrt", B, Callee->getAttributes()); Value *FAbs = - EmitUnaryFloatFnCall(Sqrt, "fabs", B, Callee->getAttributes()); + emitUnaryFloatFnCall(Sqrt, "fabs", B, Callee->getAttributes()); Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf); Value *Sel = B.CreateSelect(FCmp, Inf, FAbs); return Sel; @@ -1207,7 +1074,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Op1, "powrecip"); // In -ffast-math, generate repeated fmul instead of generating pow(x, n). - if (UnsafeFPMath) { + if (CI->hasUnsafeAlgebra()) { APFloat V = abs(Op2C->getValueAPF()); // We limit to a max of 7 fmul(s). Thus max exponent is 32. // This transformation applies to integer exponents only. @@ -1224,6 +1091,8 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { // So we first convert V to something which could be converted to double. bool ignored; V.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &ignored); + + // TODO: Should the new instructions propagate the 'fast' flag of the pow()? Value *FMul = getPow(InnerChain, V.convertToDouble(), B); // For negative exponents simply compute the reciprocal. if (Op2C->isNegative()) @@ -1236,19 +1105,11 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - Function *Caller = CI->getParent()->getParent(); Value *Ret = nullptr; StringRef Name = Callee->getName(); if (UnsafeFPShrink && Name == "exp2" && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, true); - FunctionType *FT = Callee->getFunctionType(); - // Just make sure this has 1 argument of FP type, which matches the - // result type. - if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isFloatingPointTy()) - return Ret; - Value *Op = CI->getArgOperand(0); // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32 // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32 @@ -1273,11 +1134,11 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) { if (!Op->getType()->isFloatTy()) One = ConstantExpr::getFPExtend(One, Op->getType()); - Module *M = Caller->getParent(); - Value *Callee = + Module *M = CI->getModule(); + Value *NewCallee = M->getOrInsertFunction(TLI->getName(LdExp), Op->getType(), Op->getType(), B.getInt32Ty(), nullptr); - CallInst *CI = B.CreateCall(Callee, {One, LdExpArg}); + CallInst *CI = B.CreateCall(NewCallee, {One, LdExpArg}); if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts())) CI->setCallingConv(F->getCallingConv()); @@ -1294,12 +1155,6 @@ Value *LibCallSimplifier::optimizeFabs(CallInst *CI, IRBuilder<> &B) { if (Name == "fabs" && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, false); - FunctionType *FT = Callee->getFunctionType(); - // Make sure this has 1 argument of FP type which matches the result type. - if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isFloatingPointTy()) - return Ret; - Value *Op = CI->getArgOperand(0); if (Instruction *I = dyn_cast<Instruction>(Op)) { // Fold fabs(x * x) -> x * x; any squared FP value must already be positive. 
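The repeated-fmul fold above ("max of 7 fmul(s), thus max exponent is 32") is a budgeted form of square-and-multiply, which needs only O(log n) multiplies; getPow builds a memoized chain of this shape. The textbook variant as a sketch (fast-math is what licenses reassociating pow into explicit products):

double powByMul(double X, unsigned N) {
  double Result = 1.0;
  while (N) {
    if (N & 1)
      Result *= X; // fold in the current power of two when the bit is set
    X *= X;        // X, X^2, X^4, X^8, ...
    N >>= 1;
  }
  return Result;
}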
@@ -1311,21 +1166,14 @@ Value *LibCallSimplifier::optimizeFabs(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) { + Function *Callee = CI->getCalledFunction(); // If we can shrink the call to a float function rather than a double // function, do that first. - Function *Callee = CI->getCalledFunction(); StringRef Name = Callee->getName(); if ((Name == "fmin" || Name == "fmax") && hasFloatVersion(Name)) if (Value *Ret = optimizeBinaryDoubleFP(CI, B)) return Ret; - // Make sure this has 2 arguments of FP type which match the result type. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - !FT->getParamType(0)->isFloatingPointTy()) - return nullptr; - IRBuilder<>::FastMathFlagGuard Guard(B); FastMathFlags FMF; if (CI->hasUnsafeAlgebra()) { @@ -1360,13 +1208,6 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) { StringRef Name = Callee->getName(); if (UnsafeFPShrink && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, true); - FunctionType *FT = Callee->getFunctionType(); - - // Just make sure this has 1 argument of FP type, which matches the - // result type. - if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isFloatingPointTy()) - return Ret; if (!CI->hasUnsafeAlgebra()) return Ret; @@ -1392,7 +1233,7 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) { if (F && ((TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) && Func == LibFunc::pow) || F->getIntrinsicID() == Intrinsic::pow)) return B.CreateFMul(OpC->getArgOperand(1), - EmitUnaryFloatFnCall(OpC->getOperand(0), Callee->getName(), B, + emitUnaryFloatFnCall(OpC->getOperand(0), Callee->getName(), B, Callee->getAttributes()), "mul"); // log(exp2(y)) -> y*log(2) @@ -1400,7 +1241,7 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) { TLI->has(Func) && Func == LibFunc::exp2) return B.CreateFMul( OpC->getArgOperand(0), - EmitUnaryFloatFnCall(ConstantFP::get(CI->getType(), 2.0), + emitUnaryFloatFnCall(ConstantFP::get(CI->getType(), 2.0), Callee->getName(), B, Callee->getAttributes()), "logmul"); return Ret; @@ -1408,21 +1249,11 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - Value *Ret = nullptr; if (TLI->has(LibFunc::sqrtf) && (Callee->getName() == "sqrt" || Callee->getIntrinsicID() == Intrinsic::sqrt)) Ret = optimizeUnaryDoubleFP(CI, B, true); - // FIXME: Refactor - this check is repeated all over this file and even in the - // preceding call to shrink double -> float. - - // Make sure this has 1 argument of FP type, which matches the result type. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isFloatingPointTy()) - return Ret; - if (!CI->hasUnsafeAlgebra()) return Ret; @@ -1489,13 +1320,6 @@ Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilder<> &B) { StringRef Name = Callee->getName(); if (UnsafeFPShrink && Name == "tan" && hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, true); - FunctionType *FT = Callee->getFunctionType(); - - // Just make sure this has 1 argument of FP type, which matches the - // result type. 
- if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || - !FT->getParamType(0)->isFloatingPointTy()) - return Ret; Value *Op1 = CI->getArgOperand(0); auto *OpC = dyn_cast<CallInst>(Op1); @@ -1519,13 +1343,65 @@ Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilder<> &B) { return Ret; } -static bool isTrigLibCall(CallInst *CI); +static bool isTrigLibCall(CallInst *CI) { + // We can only hope to do anything useful if we can ignore things like errno + // and floating-point exceptions. + // We already checked the prototype. + return CI->hasFnAttr(Attribute::NoUnwind) && + CI->hasFnAttr(Attribute::ReadNone); +} + static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg, bool UseFloat, Value *&Sin, Value *&Cos, - Value *&SinCos); + Value *&SinCos) { + Type *ArgTy = Arg->getType(); + Type *ResTy; + StringRef Name; -Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilder<> &B) { + Triple T(OrigCallee->getParent()->getTargetTriple()); + if (UseFloat) { + Name = "__sincospif_stret"; + + assert(T.getArch() != Triple::x86 && "x86 messy and unsupported for now"); + // x86_64 can't use {float, float} since that would be returned in both + // xmm0 and xmm1, which isn't what a real struct would do. + ResTy = T.getArch() == Triple::x86_64 + ? static_cast<Type *>(VectorType::get(ArgTy, 2)) + : static_cast<Type *>(StructType::get(ArgTy, ArgTy, nullptr)); + } else { + Name = "__sincospi_stret"; + ResTy = StructType::get(ArgTy, ArgTy, nullptr); + } + + Module *M = OrigCallee->getParent(); + Value *Callee = M->getOrInsertFunction(Name, OrigCallee->getAttributes(), + ResTy, ArgTy, nullptr); + + if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) { + // If the argument is an instruction, it must dominate all uses so put our + // sincos call there. + B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator()); + } else { + // Otherwise (e.g. for a constant) the beginning of the function is as + // good a place as any. + BasicBlock &EntryBB = B.GetInsertBlock()->getParent()->getEntryBlock(); + B.SetInsertPoint(&EntryBB, EntryBB.begin()); + } + + SinCos = B.CreateCall(Callee, Arg, "sincospi"); + + if (SinCos->getType()->isStructTy()) { + Sin = B.CreateExtractValue(SinCos, 0, "sinpi"); + Cos = B.CreateExtractValue(SinCos, 1, "cospi"); + } else { + Sin = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 0), + "sinpi"); + Cos = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 1), + "cospi"); + } +} +Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilder<> &B) { // Make sure the prototype is as expected, otherwise the rest of the // function is probably invalid and likely to abort. if (!isTrigLibCall(CI)) @@ -1541,9 +1417,9 @@ Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilder<> &B) { // Look for all compatible sinpi, cospi and sincospi calls with the same // argument. If there are enough (in some sense) we can make the // substitution. + Function *F = CI->getFunction(); for (User *U : Arg->users()) - classifyArgUse(U, CI->getParent(), IsFloat, SinCalls, CosCalls, - SinCosCalls); + classifyArgUse(U, F, IsFloat, SinCalls, CosCalls, SinCosCalls); // It's only worthwhile if both sinpi and cospi are actually used. 
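Once a sinpi and a cospi of the same argument are both known to exist, insertSinCosCall above replaces the pair with one __sincospi_stret call that returns both results at once; the check that follows makes the rewrite fire only when it actually saves a call. A hedged sketch of the caller-side effect (the aggregate layout here is illustrative; as the code notes, the float variant on x86-64 really returns a two-element vector):

struct SinCosPi { double Sin, Cos; };

// Assumed prototype for the combined Darwin runtime entry point.
extern "C" SinCosPi __sincospi_stret(double X);

double sumSinCosPi(double X) {
  SinCosPi R = __sincospi_stret(X); // one call instead of sinpi(X) + cospi(X)
  return R.Sin + R.Cos;
}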
if (SinCosCalls.empty() && (SinCalls.empty() || CosCalls.empty())) @@ -1559,35 +1435,23 @@ Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilder<> &B) { return nullptr; } -static bool isTrigLibCall(CallInst *CI) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - - // We can only hope to do anything useful if we can ignore things like errno - // and floating-point exceptions. - bool AttributesSafe = - CI->hasFnAttr(Attribute::NoUnwind) && CI->hasFnAttr(Attribute::ReadNone); - - // Other than that we need float(float) or double(double) - return AttributesSafe && FT->getNumParams() == 1 && - FT->getReturnType() == FT->getParamType(0) && - (FT->getParamType(0)->isFloatTy() || - FT->getParamType(0)->isDoubleTy()); -} - -void -LibCallSimplifier::classifyArgUse(Value *Val, BasicBlock *BB, bool IsFloat, - SmallVectorImpl<CallInst *> &SinCalls, - SmallVectorImpl<CallInst *> &CosCalls, - SmallVectorImpl<CallInst *> &SinCosCalls) { +void LibCallSimplifier::classifyArgUse( + Value *Val, Function *F, bool IsFloat, + SmallVectorImpl<CallInst *> &SinCalls, + SmallVectorImpl<CallInst *> &CosCalls, + SmallVectorImpl<CallInst *> &SinCosCalls) { CallInst *CI = dyn_cast<CallInst>(Val); if (!CI) return; + // Don't consider calls in other functions. + if (CI->getFunction() != F) + return; + Function *Callee = CI->getCalledFunction(); LibFunc::Func Func; - if (!Callee || !TLI->getLibFunc(Callee->getName(), Func) || !TLI->has(Func) || + if (!Callee || !TLI->getLibFunc(*Callee, Func) || !TLI->has(Func) || !isTrigLibCall(CI)) return; @@ -1614,69 +1478,12 @@ void LibCallSimplifier::replaceTrigInsts(SmallVectorImpl<CallInst *> &Calls, replaceAllUsesWith(C, Res); } -void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg, - bool UseFloat, Value *&Sin, Value *&Cos, Value *&SinCos) { - Type *ArgTy = Arg->getType(); - Type *ResTy; - StringRef Name; - - Triple T(OrigCallee->getParent()->getTargetTriple()); - if (UseFloat) { - Name = "__sincospif_stret"; - - assert(T.getArch() != Triple::x86 && "x86 messy and unsupported for now"); - // x86_64 can't use {float, float} since that would be returned in both - // xmm0 and xmm1, which isn't what a real struct would do. - ResTy = T.getArch() == Triple::x86_64 - ? static_cast<Type *>(VectorType::get(ArgTy, 2)) - : static_cast<Type *>(StructType::get(ArgTy, ArgTy, nullptr)); - } else { - Name = "__sincospi_stret"; - ResTy = StructType::get(ArgTy, ArgTy, nullptr); - } - - Module *M = OrigCallee->getParent(); - Value *Callee = M->getOrInsertFunction(Name, OrigCallee->getAttributes(), - ResTy, ArgTy, nullptr); - - if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) { - // If the argument is an instruction, it must dominate all uses so put our - // sincos call there. - B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator()); - } else { - // Otherwise (e.g. for a constant) the beginning of the function is as - // good a place as any. 
- BasicBlock &EntryBB = B.GetInsertBlock()->getParent()->getEntryBlock(); - B.SetInsertPoint(&EntryBB, EntryBB.begin()); - } - - SinCos = B.CreateCall(Callee, Arg, "sincospi"); - - if (SinCos->getType()->isStructTy()) { - Sin = B.CreateExtractValue(SinCos, 0, "sinpi"); - Cos = B.CreateExtractValue(SinCos, 1, "cospi"); - } else { - Sin = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 0), - "sinpi"); - Cos = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 1), - "cospi"); - } -} - //===----------------------------------------------------------------------===// // Integer Library Call Optimizations //===----------------------------------------------------------------------===// -static bool checkIntUnaryReturnAndParam(Function *Callee) { - FunctionType *FT = Callee->getFunctionType(); - return FT->getNumParams() == 1 && FT->getReturnType()->isIntegerTy(32) && - FT->getParamType(0)->isIntegerTy(); -} - Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - if (!checkIntUnaryReturnAndParam(Callee)) - return nullptr; Value *Op = CI->getArgOperand(0); // Constant fold. @@ -1700,13 +1507,6 @@ Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - FunctionType *FT = Callee->getFunctionType(); - // We require integer(integer) where the types agree. - if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || - FT->getParamType(0) != FT->getReturnType()) - return nullptr; - // abs(x) -> x >s -1 ? x : -x Value *Op = CI->getArgOperand(0); Value *Pos = @@ -1716,9 +1516,6 @@ Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilder<> &B) { - if (!checkIntUnaryReturnAndParam(CI->getCalledFunction())) - return nullptr; - // isdigit(c) -> (c-'0') <u 10 Value *Op = CI->getArgOperand(0); Op = B.CreateSub(Op, B.getInt32('0'), "isdigittmp"); @@ -1727,9 +1524,6 @@ Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeIsAscii(CallInst *CI, IRBuilder<> &B) { - if (!checkIntUnaryReturnAndParam(CI->getCalledFunction())) - return nullptr; - // isascii(c) -> c <u 128 Value *Op = CI->getArgOperand(0); Op = B.CreateICmpULT(Op, B.getInt32(128), "isascii"); @@ -1737,9 +1531,6 @@ Value *LibCallSimplifier::optimizeIsAscii(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeToAscii(CallInst *CI, IRBuilder<> &B) { - if (!checkIntUnaryReturnAndParam(CI->getCalledFunction())) - return nullptr; - // toascii(c) -> c & 0x7f return B.CreateAnd(CI->getArgOperand(0), ConstantInt::get(CI->getType(), 0x7F)); @@ -1753,6 +1544,7 @@ static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg); Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilder<> &B, int StreamArg) { + Function *Callee = CI->getCalledFunction(); // Error reporting calls should be cold, mark them as such. // This applies even to non-builtin calls: it is only a hint and applies to // functions that the frontend might not understand as builtins. @@ -1761,8 +1553,6 @@ Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilder<> &B, // Improving Static Branch Prediction in a Compiler // Brian L. Deitrich, Ben-Chung Cheng, Wen-mei W. Hwu // Proceedings of PACT'98, Oct. 
1998, IEEE - Function *Callee = CI->getCalledFunction(); - if (!CI->hasFnAttr(Attribute::Cold) && isReportingError(Callee, CI, StreamArg)) { CI->addAttribute(AttributeSet::FunctionIndex, Attribute::Cold); @@ -1808,12 +1598,18 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilder<> &B) { if (!CI->use_empty()) return nullptr; - // printf("x") -> putchar('x'), even for '%'. - if (FormatStr.size() == 1) { - Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, TLI); - if (CI->use_empty() || !Res) - return Res; - return B.CreateIntCast(Res, CI->getType(), true); + // printf("x") -> putchar('x'), even for "%" and "%%". + if (FormatStr.size() == 1 || FormatStr == "%%") + return emitPutChar(B.getInt32(FormatStr[0]), B, TLI); + + // printf("%s", "a") --> putchar('a') + if (FormatStr == "%s" && CI->getNumArgOperands() > 1) { + StringRef ChrStr; + if (!getConstantStringInfo(CI->getOperand(1), ChrStr)) + return nullptr; + if (ChrStr.size() != 1) + return nullptr; + return emitPutChar(B.getInt32(ChrStr[0]), B, TLI); } // printf("foo\n") --> puts("foo") @@ -1823,40 +1619,26 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilder<> &B) { // pass to be run after this pass, to merge duplicate strings. FormatStr = FormatStr.drop_back(); Value *GV = B.CreateGlobalString(FormatStr, "str"); - Value *NewCI = EmitPutS(GV, B, TLI); - return (CI->use_empty() || !NewCI) - ? NewCI - : ConstantInt::get(CI->getType(), FormatStr.size() + 1); + return emitPutS(GV, B, TLI); } // Optimize specific format strings. // printf("%c", chr) --> putchar(chr) if (FormatStr == "%c" && CI->getNumArgOperands() > 1 && - CI->getArgOperand(1)->getType()->isIntegerTy()) { - Value *Res = EmitPutChar(CI->getArgOperand(1), B, TLI); - - if (CI->use_empty() || !Res) - return Res; - return B.CreateIntCast(Res, CI->getType(), true); - } + CI->getArgOperand(1)->getType()->isIntegerTy()) + return emitPutChar(CI->getArgOperand(1), B, TLI); // printf("%s\n", str) --> puts(str) if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 && - CI->getArgOperand(1)->getType()->isPointerTy()) { - return EmitPutS(CI->getArgOperand(1), B, TLI); - } + CI->getArgOperand(1)->getType()->isPointerTy()) + return emitPutS(CI->getArgOperand(1), B, TLI); return nullptr; } Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - // Require one fixed pointer argument and an integer/void result. 
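Collected, the printf rewrites above all trade a format-string interpreter for a direct libc call, and all are gated on printf's result being unused, since putchar and puts do not return the character count printf would. At the source level:

#include <cstdio>

void folds(int C, const char *S) {
  std::printf("\n");      // -> putchar('\n'); likewise "%" and "%%"
  std::printf("%c", C);   // -> putchar(C)
  std::printf("%s\n", S); // -> puts(S)
  std::printf("ok\n");    // -> puts("ok"); no '%' in the format body
}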
FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() || - !(FT->getReturnType()->isIntegerTy() || FT->getReturnType()->isVoidTy())) - return nullptr; - if (Value *V = optimizePrintFString(CI, B)) { return V; } @@ -1909,7 +1691,7 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) { if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return nullptr; Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char"); - Value *Ptr = CastToCStr(CI->getArgOperand(0), B); + Value *Ptr = castToCStr(CI->getArgOperand(0), B); B.CreateStore(V, Ptr); Ptr = B.CreateGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul"); B.CreateStore(B.getInt8(0), Ptr); @@ -1922,7 +1704,7 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) { if (!CI->getArgOperand(2)->getType()->isPointerTy()) return nullptr; - Value *Len = EmitStrLen(CI->getArgOperand(2), B, DL, TLI); + Value *Len = emitStrLen(CI->getArgOperand(2), B, DL, TLI); if (!Len) return nullptr; Value *IncLen = @@ -1937,13 +1719,7 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - // Require two fixed pointer arguments and an integer result. FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - !FT->getReturnType()->isIntegerTy()) - return nullptr; - if (Value *V = optimizeSPrintFString(CI, B)) { return V; } @@ -1982,7 +1758,7 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, IRBuilder<> &B) { if (FormatStr[i] == '%') // Could handle %% -> % if we cared. return nullptr; // We found a format specifier. - return EmitFWrite( + return emitFWrite( CI->getArgOperand(1), ConstantInt::get(DL.getIntPtrType(CI->getContext()), FormatStr.size()), CI->getArgOperand(0), B, DL, TLI); @@ -1999,27 +1775,21 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, IRBuilder<> &B) { // fprintf(F, "%c", chr) --> fputc(chr, F) if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return nullptr; - return EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI); + return emitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI); } if (FormatStr[1] == 's') { // fprintf(F, "%s", str) --> fputs(str, F) if (!CI->getArgOperand(2)->getType()->isPointerTy()) return nullptr; - return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI); + return emitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI); } return nullptr; } Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - // Require two fixed paramters as pointers and integer result. FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - !FT->getReturnType()->isIntegerTy()) - return nullptr; - if (Value *V = optimizeFPrintFString(CI, B)) { return V; } @@ -2041,16 +1811,6 @@ Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilder<> &B) { optimizeErrorReporting(CI, B, 3); - Function *Callee = CI->getCalledFunction(); - // Require a pointer, an integer, an integer, a pointer, returning integer. 
- FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isIntegerTy() || - !FT->getParamType(2)->isIntegerTy() || - !FT->getParamType(3)->isPointerTy() || - !FT->getReturnType()->isIntegerTy()) - return nullptr; - // Get the element size and count. ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2)); @@ -2065,8 +1825,8 @@ Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilder<> &B) { // If this is writing one byte, turn it into fputc. // This optimisation is only valid, if the return value is unused. if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F) - Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char"); - Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, TLI); + Value *Char = B.CreateLoad(castToCStr(CI->getArgOperand(0), B), "char"); + Value *NewCI = emitFPutC(Char, CI->getArgOperand(3), B, TLI); return NewCI ? ConstantInt::get(CI->getType(), 1) : nullptr; } @@ -2076,12 +1836,13 @@ Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) { optimizeErrorReporting(CI, B, 1); - Function *Callee = CI->getCalledFunction(); + // Don't rewrite fputs to fwrite when optimising for size because fwrite + // requires more arguments and thus extra MOVs are required. + if (CI->getParent()->getParent()->optForSize()) + return nullptr; - // Require two pointers. Also, we can't optimize if return value is used. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || !CI->use_empty()) + // We can't optimize if return value is used. + if (!CI->use_empty()) return nullptr; // fputs(s,F) --> fwrite(s,1,strlen(s),F) @@ -2090,20 +1851,13 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) { return nullptr; // Known to have no uses (see above). - return EmitFWrite( + return emitFWrite( CI->getArgOperand(0), ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len - 1), CI->getArgOperand(1), B, DL, TLI); } Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - // Require one fixed pointer argument and an integer/void result. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() || - !(FT->getReturnType()->isIntegerTy() || FT->getReturnType()->isVoidTy())) - return nullptr; - // Check for a constant string. StringRef Str; if (!getConstantStringInfo(CI->getArgOperand(0), Str)) @@ -2111,7 +1865,7 @@ Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) { if (Str.empty() && CI->use_empty()) { // puts("") -> putchar('\n') - Value *Res = EmitPutChar(B.getInt32('\n'), B, TLI); + Value *Res = emitPutChar(B.getInt32('\n'), B, TLI); if (CI->use_empty() || !Res) return Res; return B.CreateIntCast(Res, CI->getType(), true); @@ -2133,10 +1887,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, IRBuilder<> &Builder) { LibFunc::Func Func; Function *Callee = CI->getCalledFunction(); - StringRef FuncName = Callee->getName(); - // Check for string/memory library functions. - if (TLI->getLibFunc(FuncName, Func) && TLI->has(Func)) { + if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) { // Make sure we never change the calling convention. 
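The fwrite and fputs hunks above fold a one-byte fwrite into fputc, and fputs of a known-length string into fwrite, except under optForSize(), where fwrite's extra arguments would cost code size. A standalone model of both folds (illustrative names; the OptForSize flag stands in for the function attribute the real code queries):

```cpp
#include <cstdio>
#include <cstring>

// fwrite(S, 1, 1, F) -> fputc(S[0], F): only when size*count is one byte
// and the call's result is unused (fputc returns something different).
void fwriteOneByte(const char *S, FILE *F) { std::fputc(S[0], F); }

// fputs(S, F) -> fwrite(S, 1, strlen(S), F): profitable when the length is
// a known constant, but skipped when optimizing for size because fwrite
// needs more argument-setup instructions than fputs.
void fputsAsFwrite(const char *S, FILE *F, bool OptForSize) {
  std::size_t Len = std::strlen(S); // constant-folded for known literals
  if (OptForSize || Len == 0)
    std::fputs(S, F); // keep the original, smaller call
  else
    std::fwrite(S, 1, Len, F);
}

int main() {
  fwriteOneByte("x", stdout);
  fputsAsFwrite("hello\n", stdout, false);
}
```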
assert((ignoreCallingConv(Func) || CI->getCallingConv() == llvm::CallingConv::C) && @@ -2208,10 +1960,10 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { IRBuilder<> Builder(CI, /*FPMathTag=*/nullptr, OpBundles); bool isCallingConvC = CI->getCallingConv() == llvm::CallingConv::C; - // Command-line parameter overrides function attribute. + // Command-line parameter overrides instruction attribute. if (EnableUnsafeFPShrink.getNumOccurrences() > 0) UnsafeFPShrink = EnableUnsafeFPShrink; - else if (canUseUnsafeFPMath(Callee)) + else if (isa<FPMathOperator>(CI) && CI->hasUnsafeAlgebra()) UnsafeFPShrink = true; // First, check for intrinsics. @@ -2229,6 +1981,7 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { return optimizeLog(CI, Builder); case Intrinsic::sqrt: return optimizeSqrt(CI, Builder); + // TODO: Use foldMallocMemset() with memset intrinsic. default: return nullptr; } @@ -2253,7 +2006,7 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { } // Then check for known library functions. - if (TLI->getLibFunc(FuncName, Func) && TLI->has(Func)) { + if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) { // We never change the calling convention. if (!ignoreCallingConv(Func) && !isCallingConvC) return nullptr; @@ -2457,11 +2210,6 @@ bool FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - - if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memcpy_chk)) - return nullptr; - if (isFortifiedCallFoldable(CI, 3, 2, false)) { B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), 1); @@ -2472,11 +2220,6 @@ Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - - if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memmove_chk)) - return nullptr; - if (isFortifiedCallFoldable(CI, 3, 2, false)) { B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), 1); @@ -2487,10 +2230,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); - - if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memset_chk)) - return nullptr; + // TODO: Try foldMallocMemset() here. if (isFortifiedCallFoldable(CI, 3, 2, false)) { Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); @@ -2506,16 +2246,12 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI, Function *Callee = CI->getCalledFunction(); StringRef Name = Callee->getName(); const DataLayout &DL = CI->getModule()->getDataLayout(); - - if (!checkStringCopyLibFuncSignature(Callee, Func)) - return nullptr; - Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1), *ObjSize = CI->getArgOperand(2); // __stpcpy_chk(x,x,...) -> x+strlen(x) if (Func == LibFunc::stpcpy_chk && !OnlyLowerUnknownSize && Dst == Src) { - Value *StrLen = EmitStrLen(Src, B, DL, TLI); + Value *StrLen = emitStrLen(Src, B, DL, TLI); return StrLen ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, StrLen) : nullptr; } @@ -2525,7 +2261,7 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI, // TODO: It might be nice to get a maximum length out of the possible // string lengths for varying. 
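The fortified _chk folds above all gate on isFortifiedCallFoldable, which proves the guarded write cannot exceed the destination's known object size. A simplified standalone model of that test (not the real helper; the all-ones "unknown size" convention matches the fortified-call ABI, the rest is an assumption for illustration):

```cpp
#include <cassert>
#include <cstdint>

// Model of the core foldability test: a __memcpy_chk-style call can be
// lowered to the plain libcall when the object-size operand shows the
// runtime check could never fire.
bool isFoldable(std::uint64_t ObjSize, std::uint64_t Len, bool ObjSizeKnown,
                bool LenKnown) {
  // (size_t)-1 as the object size means "unknown"; the check is a no-op,
  // so lowering is always safe.
  if (ObjSizeKnown && ObjSize == UINT64_MAX)
    return true;
  // Otherwise both operands must be constants with Len <= ObjSize.
  return ObjSizeKnown && LenKnown && Len <= ObjSize;
}

int main() {
  assert(isFoldable(UINT64_MAX, 0, true, false)); // check disabled
  assert(isFoldable(64, 16, true, true));         // provably in bounds
  assert(!isFoldable(8, 16, true, true));         // possible overflow
}
```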
if (isFortifiedCallFoldable(CI, 2, 1, true))
-    return EmitStrCpy(Dst, Src, B, TLI, Name.substr(2, 6));
+    return emitStrCpy(Dst, Src, B, TLI, Name.substr(2, 6));
 
   if (OnlyLowerUnknownSize)
     return nullptr;
@@ -2537,7 +2273,7 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
   Type *SizeTTy = DL.getIntPtrType(CI->getContext());
   Value *LenV = ConstantInt::get(SizeTTy, Len);
-  Value *Ret = EmitMemCpyChk(Dst, Src, LenV, ObjSize, B, DL, TLI);
+  Value *Ret = emitMemCpyChk(Dst, Src, LenV, ObjSize, B, DL, TLI);
   // If the function was an __stpcpy_chk, and we were able to fold it into
   // a __memcpy_chk, we still need to return the correct end pointer.
   if (Ret && Func == LibFunc::stpcpy_chk)
@@ -2550,11 +2286,8 @@ Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI,
                                                        LibFunc::Func Func) {
   Function *Callee = CI->getCalledFunction();
   StringRef Name = Callee->getName();
-
-  if (!checkStringCopyLibFuncSignature(Callee, Func))
-    return nullptr;
   if (isFortifiedCallFoldable(CI, 3, 2, false)) {
-    Value *Ret = EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+    Value *Ret = emitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
                              CI->getArgOperand(2), B, TLI, Name.substr(2, 7));
     return Ret;
   }
@@ -2577,15 +2310,15 @@ Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) {
   LibFunc::Func Func;
   Function *Callee = CI->getCalledFunction();
-  StringRef FuncName = Callee->getName();
   SmallVector<OperandBundleDef, 2> OpBundles;
   CI->getOperandBundlesAsDefs(OpBundles);
   IRBuilder<> Builder(CI, /*FPMathTag=*/nullptr, OpBundles);
   bool isCallingConvC = CI->getCallingConv() == llvm::CallingConv::C;
-  // First, check that this is a known library functions.
+  // First, check that this is a known library function and that the prototype
+  // is correct.
   if (!TLI->getLibFunc(*Callee, Func))
     return nullptr;
 
   // We never change the calling convention.
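A pattern repeated throughout this file's diff: per-optimizer prototype checks are deleted because the Function overload of getLibFunc now both recognizes the callee and validates its signature once, before dispatch. A toy sketch of that structure (invented types and names, not the TargetLibraryInfo API):

```cpp
#include <string>

enum class LibFn { Printf, FPuts };

struct Callee {
  std::string Name;
  int NumParams;
};

// Stand-in for the Function overload of getLibFunc: reject a call whose
// prototype does not match the library function it is named after.
bool getLibFn(const Callee &C, LibFn &Fn) {
  if (C.Name == "printf" && C.NumParams >= 1)
    Fn = LibFn::Printf;
  else if (C.Name == "fputs" && C.NumParams == 2)
    Fn = LibFn::FPuts;
  else
    return false; // unknown name or wrong prototype
  return true;
}

const char *optimizeCall(const Callee &C) {
  LibFn Fn;
  if (!getLibFn(C, Fn)) // signature already proven correct past this point
    return "<not a known libcall>";
  switch (Fn) { // the per-optimizer re-checks are no longer needed here
  case LibFn::Printf:
    return "optimizePrintF";
  case LibFn::FPuts:
    return "optimizeFPuts";
  }
  return "<unreachable>";
}

int main() {
  // A well-typed fputs is recognized; a three-argument "fputs" is not.
  Callee Good{"fputs", 2}, Bad{"fputs", 3};
  return (optimizeCall(Good) == std::string("optimizeFPuts") &&
          optimizeCall(Bad) == std::string("<not a known libcall>"))
             ? 0
             : 1;
}
```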
diff --git a/lib/Transforms/Utils/SplitModule.cpp b/lib/Transforms/Utils/SplitModule.cpp index ad6b782caf8b5..e9a368f4faa4e 100644 --- a/lib/Transforms/Utils/SplitModule.cpp +++ b/lib/Transforms/Utils/SplitModule.cpp @@ -13,19 +13,184 @@ // //===----------------------------------------------------------------------===// +#define DEBUG_TYPE "split-module" + #include "llvm/Transforms/Utils/SplitModule.h" +#include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/Hashing.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalObject.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/MD5.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Cloning.h" +#include <queue> using namespace llvm; +namespace { +typedef EquivalenceClasses<const GlobalValue *> ClusterMapType; +typedef DenseMap<const Comdat *, const GlobalValue *> ComdatMembersType; +typedef DenseMap<const GlobalValue *, unsigned> ClusterIDMapType; +} + +static void addNonConstUser(ClusterMapType &GVtoClusterMap, + const GlobalValue *GV, const User *U) { + assert((!isa<Constant>(U) || isa<GlobalValue>(U)) && "Bad user"); + + if (const Instruction *I = dyn_cast<Instruction>(U)) { + const GlobalValue *F = I->getParent()->getParent(); + GVtoClusterMap.unionSets(GV, F); + } else if (isa<GlobalIndirectSymbol>(U) || isa<Function>(U) || + isa<GlobalVariable>(U)) { + GVtoClusterMap.unionSets(GV, cast<GlobalValue>(U)); + } else { + llvm_unreachable("Underimplemented use case"); + } +} + +// Adds all GlobalValue users of V to the same cluster as GV. +static void addAllGlobalValueUsers(ClusterMapType &GVtoClusterMap, + const GlobalValue *GV, const Value *V) { + for (auto *U : V->users()) { + SmallVector<const User *, 4> Worklist; + Worklist.push_back(U); + while (!Worklist.empty()) { + const User *UU = Worklist.pop_back_val(); + // For each constant that is not a GV (a pure const) recurse. + if (isa<Constant>(UU) && !isa<GlobalValue>(UU)) { + Worklist.append(UU->user_begin(), UU->user_end()); + continue; + } + addNonConstUser(GVtoClusterMap, GV, UU); + } + } +} + +// Find partitions for module in the way that no locals need to be +// globalized. +// Try to balance pack those partitions into N files since this roughly equals +// thread balancing for the backend codegen step. +static void findPartitions(Module *M, ClusterIDMapType &ClusterIDMap, + unsigned N) { + // At this point module should have the proper mix of globals and locals. + // As we attempt to partition this module, we must not change any + // locals to globals. + DEBUG(dbgs() << "Partition module with (" << M->size() << ")functions\n"); + ClusterMapType GVtoClusterMap; + ComdatMembersType ComdatMembers; + + auto recordGVSet = [&GVtoClusterMap, &ComdatMembers](GlobalValue &GV) { + if (GV.isDeclaration()) + return; + + if (!GV.hasName()) + GV.setName("__llvmsplit_unnamed"); + + // Comdat groups must not be partitioned. For comdat groups that contain + // locals, record all their members here so we can keep them together. + // Comdat groups that only contain external globals are already handled by + // the MD5-based partitioning. 
+    if (const Comdat *C = GV.getComdat()) {
+      auto &Member = ComdatMembers[C];
+      if (Member)
+        GVtoClusterMap.unionSets(Member, &GV);
+      else
+        Member = &GV;
+    }
+
+    // For aliases we should not separate them from their aliasees regardless
+    // of linkage.
+    if (auto *GIS = dyn_cast<GlobalIndirectSymbol>(&GV)) {
+      if (const GlobalObject *Base = GIS->getBaseObject())
+        GVtoClusterMap.unionSets(&GV, Base);
+    }
+
+    if (const Function *F = dyn_cast<Function>(&GV)) {
+      for (const BasicBlock &BB : *F) {
+        BlockAddress *BA = BlockAddress::lookup(&BB);
+        if (!BA || !BA->isConstantUsed())
+          continue;
+        addAllGlobalValueUsers(GVtoClusterMap, F, BA);
+      }
+    }
+
+    if (GV.hasLocalLinkage())
+      addAllGlobalValueUsers(GVtoClusterMap, &GV, &GV);
+  };
+
+  std::for_each(M->begin(), M->end(), recordGVSet);
+  std::for_each(M->global_begin(), M->global_end(), recordGVSet);
+  std::for_each(M->alias_begin(), M->alias_end(), recordGVSet);
+
+  // Assign all GVs to merged clusters while balancing the number of objects
+  // in each.
+  auto CompareClusters = [](const std::pair<unsigned, unsigned> &a,
+                            const std::pair<unsigned, unsigned> &b) {
+    if (a.second || b.second)
+      return a.second > b.second;
+    else
+      return a.first > b.first;
+  };
+
+  std::priority_queue<std::pair<unsigned, unsigned>,
+                      std::vector<std::pair<unsigned, unsigned>>,
+                      decltype(CompareClusters)>
+      BalancinQueue(CompareClusters);
+  // Pre-populate priority queue with N slot blanks.
+  for (unsigned i = 0; i < N; ++i)
+    BalancinQueue.push(std::make_pair(i, 0));
+
+  typedef std::pair<unsigned, ClusterMapType::iterator> SortType;
+  SmallVector<SortType, 64> Sets;
+  SmallPtrSet<const GlobalValue *, 32> Visited;
+
+  // To guarantee determinism, we have to sort SCC according to size.
+  // When size is the same, use leader's name.
+  for (ClusterMapType::iterator I = GVtoClusterMap.begin(),
+                                E = GVtoClusterMap.end(); I != E; ++I)
+    if (I->isLeader())
+      Sets.push_back(
+          std::make_pair(std::distance(GVtoClusterMap.member_begin(I),
+                                       GVtoClusterMap.member_end()), I));
+
+  std::sort(Sets.begin(), Sets.end(), [](const SortType &a, const SortType &b) {
+    if (a.first == b.first)
+      return a.second->getData()->getName() > b.second->getData()->getName();
+    else
+      return a.first > b.first;
+  });
+
+  for (auto &I : Sets) {
+    unsigned CurrentClusterID = BalancinQueue.top().first;
+    unsigned CurrentClusterSize = BalancinQueue.top().second;
+    BalancinQueue.pop();
+
+    DEBUG(dbgs() << "Root[" << CurrentClusterID << "] cluster_size(" << I.first
+                 << ") ----> " << I.second->getData()->getName() << "\n");
+
+    for (ClusterMapType::member_iterator MI =
+             GVtoClusterMap.findLeader(I.second);
+         MI != GVtoClusterMap.member_end(); ++MI) {
+      if (!Visited.insert(*MI).second)
+        continue;
+      DEBUG(dbgs() << "----> " << (*MI)->getName()
+                   << ((*MI)->hasLocalLinkage() ? " l " : " e ") << "\n");
+      Visited.insert(*MI);
+      ClusterIDMap[*MI] = CurrentClusterID;
+      CurrentClusterSize++;
+    }
+    // Add this set size to the number of entries in this cluster.
+    BalancinQueue.push(std::make_pair(CurrentClusterID, CurrentClusterSize));
+  }
+}
+
 static void externalize(GlobalValue *GV) {
   if (GV->hasLocalLinkage()) {
     GV->setLinkage(GlobalValue::ExternalLinkage);
@@ -40,8 +205,8 @@ static void externalize(GlobalValue *GV) {
 // Returns whether GV should be in partition (0-based) I of N.
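findPartitions above assigns the union-find clusters greedily: sort clusters by descending size (name as tie-break for determinism), then repeatedly hand the next cluster to the least-loaded partition, tracked with a priority queue. A standalone model of that balancing step, which precedes the isInPartition fallback below (plain C++; the comparator is simplified relative to CompareClusters above):

```cpp
#include <algorithm>
#include <cstdio>
#include <queue>
#include <utility>
#include <vector>

// Given cluster sizes, return the partition index chosen for each cluster.
std::vector<unsigned> assignClusters(const std::vector<unsigned> &Sizes,
                                     unsigned N) {
  typedef std::pair<unsigned, unsigned> IDAndLoad;
  auto Cmp = [](const IDAndLoad &A, const IDAndLoad &B) {
    return A.second > B.second; // least-loaded partition on top
  };
  std::priority_queue<IDAndLoad, std::vector<IDAndLoad>, decltype(Cmp)>
      Queue(Cmp);
  for (unsigned I = 0; I < N; ++I)
    Queue.push({I, 0}); // pre-populate with N empty slots

  // Largest clusters first, so the small late ones even out the loads.
  std::vector<unsigned> Order(Sizes.size());
  for (unsigned I = 0; I < Order.size(); ++I)
    Order[I] = I;
  std::sort(Order.begin(), Order.end(),
            [&](unsigned A, unsigned B) { return Sizes[A] > Sizes[B]; });

  std::vector<unsigned> PartOf(Sizes.size());
  for (unsigned C : Order) {
    IDAndLoad Slot = Queue.top();
    Queue.pop();
    PartOf[C] = Slot.first;
    Queue.push({Slot.first, Slot.second + Sizes[C]});
  }
  return PartOf;
}

int main() {
  // Clusters of size 9, 4, 4, 1 over two partitions: loads end up 9 vs 9.
  auto P = assignClusters({9, 4, 4, 1}, 2);
  for (unsigned X : P)
    std::printf("%u ", X); // prints "0 1 1 1"
}
```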
static bool isInPartition(const GlobalValue *GV, unsigned I, unsigned N) { - if (auto GA = dyn_cast<GlobalAlias>(GV)) - if (const GlobalObject *Base = GA->getBaseObject()) + if (auto *GIS = dyn_cast<GlobalIndirectSymbol>(GV)) + if (const GlobalObject *Base = GIS->getBaseObject()) GV = Base; StringRef Name; @@ -62,21 +227,34 @@ static bool isInPartition(const GlobalValue *GV, unsigned I, unsigned N) { void llvm::SplitModule( std::unique_ptr<Module> M, unsigned N, - std::function<void(std::unique_ptr<Module> MPart)> ModuleCallback) { - for (Function &F : *M) - externalize(&F); - for (GlobalVariable &GV : M->globals()) - externalize(&GV); - for (GlobalAlias &GA : M->aliases()) - externalize(&GA); + function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback, + bool PreserveLocals) { + if (!PreserveLocals) { + for (Function &F : *M) + externalize(&F); + for (GlobalVariable &GV : M->globals()) + externalize(&GV); + for (GlobalAlias &GA : M->aliases()) + externalize(&GA); + for (GlobalIFunc &GIF : M->ifuncs()) + externalize(&GIF); + } + + // This performs splitting without a need for externalization, which might not + // always be possible. + ClusterIDMapType ClusterIDMap; + findPartitions(M.get(), ClusterIDMap, N); // FIXME: We should be able to reuse M as the last partition instead of // cloning it. - for (unsigned I = 0; I != N; ++I) { + for (unsigned I = 0; I < N; ++I) { ValueToValueMapTy VMap; std::unique_ptr<Module> MPart( - CloneModule(M.get(), VMap, [=](const GlobalValue *GV) { - return isInPartition(GV, I, N); + CloneModule(M.get(), VMap, [&](const GlobalValue *GV) { + if (ClusterIDMap.count(GV)) + return (ClusterIDMap[GV] == I); + else + return isInPartition(GV, I, N); })); if (I != 0) MPart->setModuleInlineAsm(""); diff --git a/lib/Transforms/Utils/SymbolRewriter.cpp b/lib/Transforms/Utils/SymbolRewriter.cpp index 1d1f602b041dc..7523ca527b680 100644 --- a/lib/Transforms/Utils/SymbolRewriter.cpp +++ b/lib/Transforms/Utils/SymbolRewriter.cpp @@ -58,7 +58,6 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "symbol-rewriter" -#include "llvm/CodeGen/Passes.h" #include "llvm/Pass.h" #include "llvm/ADT/SmallString.h" #include "llvm/IR/LegacyPassManager.h" diff --git a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp index 6b1d1dae5f01b..9385f825523cd 100644 --- a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp +++ b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp @@ -66,9 +66,7 @@ bool UnifyFunctionExitNodes::runOnFunction(Function &F) { "UnifiedUnreachableBlock", &F); new UnreachableInst(F.getContext(), UnreachableBlock); - for (std::vector<BasicBlock*>::iterator I = UnreachableBlocks.begin(), - E = UnreachableBlocks.end(); I != E; ++I) { - BasicBlock *BB = *I; + for (BasicBlock *BB : UnreachableBlocks) { BB->getInstList().pop_back(); // Remove the unreachable inst. BranchInst::Create(UnreachableBlock, BB); } @@ -104,10 +102,7 @@ bool UnifyFunctionExitNodes::runOnFunction(Function &F) { // Loop over all of the blocks, replacing the return instruction with an // unconditional branch. // - for (std::vector<BasicBlock*>::iterator I = ReturningBlocks.begin(), - E = ReturningBlocks.end(); I != E; ++I) { - BasicBlock *BB = *I; - + for (BasicBlock *BB : ReturningBlocks) { // Add an incoming element to the PHI node for every return instruction that // is merging into this new block... 
if (PN) diff --git a/lib/Transforms/Utils/Utils.cpp b/lib/Transforms/Utils/Utils.cpp index ed4f45c6a615d..8f85f19efe38b 100644 --- a/lib/Transforms/Utils/Utils.cpp +++ b/lib/Transforms/Utils/Utils.cpp @@ -21,17 +21,20 @@ using namespace llvm; /// initializeTransformUtils - Initialize all passes in the TransformUtils /// library. void llvm::initializeTransformUtils(PassRegistry &Registry) { - initializeAddDiscriminatorsPass(Registry); + initializeAddDiscriminatorsLegacyPassPass(Registry); initializeBreakCriticalEdgesPass(Registry); initializeInstNamerPass(Registry); - initializeLCSSAPass(Registry); + initializeLCSSAWrapperPassPass(Registry); initializeLoopSimplifyPass(Registry); initializeLowerInvokePass(Registry); initializeLowerSwitchPass(Registry); - initializePromotePassPass(Registry); + initializeNameAnonFunctionPass(Registry); + initializePromoteLegacyPassPass(Registry); initializeUnifyFunctionExitNodesPass(Registry); initializeInstSimplifierPass(Registry); initializeMetaRenamerPass(Registry); + initializeMemorySSAWrapperPassPass(Registry); + initializeMemorySSAPrinterLegacyPassPass(Registry); } /// LLVMInitializeTransformUtils - C binding for initializeTransformUtilsPasses. diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index f47ddb9f064f5..2eade8cbe8efd 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -13,9 +13,13 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Metadata.h" @@ -25,25 +29,326 @@ using namespace llvm; // Out of line method to get vtable etc for class. void ValueMapTypeRemapper::anchor() {} void ValueMaterializer::anchor() {} -void ValueMaterializer::materializeInitFor(GlobalValue *New, GlobalValue *Old) { -} -Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, - ValueMapTypeRemapper *TypeMapper, - ValueMaterializer *Materializer) { - ValueToValueMapTy::iterator I = VM.find(V); - +namespace { + +/// A basic block used in a BlockAddress whose function body is not yet +/// materialized. +struct DelayedBasicBlock { + BasicBlock *OldBB; + std::unique_ptr<BasicBlock> TempBB; + + // Explicit move for MSVC. 
+ DelayedBasicBlock(DelayedBasicBlock &&X) + : OldBB(std::move(X.OldBB)), TempBB(std::move(X.TempBB)) {} + DelayedBasicBlock &operator=(DelayedBasicBlock &&X) { + OldBB = std::move(X.OldBB); + TempBB = std::move(X.TempBB); + return *this; + } + + DelayedBasicBlock(const BlockAddress &Old) + : OldBB(Old.getBasicBlock()), + TempBB(BasicBlock::Create(Old.getContext())) {} +}; + +struct WorklistEntry { + enum EntryKind { + MapGlobalInit, + MapAppendingVar, + MapGlobalAliasee, + RemapFunction + }; + struct GVInitTy { + GlobalVariable *GV; + Constant *Init; + }; + struct AppendingGVTy { + GlobalVariable *GV; + Constant *InitPrefix; + }; + struct GlobalAliaseeTy { + GlobalAlias *GA; + Constant *Aliasee; + }; + + unsigned Kind : 2; + unsigned MCID : 29; + unsigned AppendingGVIsOldCtorDtor : 1; + unsigned AppendingGVNumNewMembers; + union { + GVInitTy GVInit; + AppendingGVTy AppendingGV; + GlobalAliaseeTy GlobalAliasee; + Function *RemapF; + } Data; +}; + +struct MappingContext { + ValueToValueMapTy *VM; + ValueMaterializer *Materializer = nullptr; + + /// Construct a MappingContext with a value map and materializer. + explicit MappingContext(ValueToValueMapTy &VM, + ValueMaterializer *Materializer = nullptr) + : VM(&VM), Materializer(Materializer) {} +}; + +class MDNodeMapper; +class Mapper { + friend class MDNodeMapper; + +#ifndef NDEBUG + DenseSet<GlobalValue *> AlreadyScheduled; +#endif + + RemapFlags Flags; + ValueMapTypeRemapper *TypeMapper; + unsigned CurrentMCID = 0; + SmallVector<MappingContext, 2> MCs; + SmallVector<WorklistEntry, 4> Worklist; + SmallVector<DelayedBasicBlock, 1> DelayedBBs; + SmallVector<Constant *, 16> AppendingInits; + +public: + Mapper(ValueToValueMapTy &VM, RemapFlags Flags, + ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer) + : Flags(Flags), TypeMapper(TypeMapper), + MCs(1, MappingContext(VM, Materializer)) {} + + /// ValueMapper should explicitly call \a flush() before destruction. + ~Mapper() { assert(!hasWorkToDo() && "Expected to be flushed"); } + + bool hasWorkToDo() const { return !Worklist.empty(); } + + unsigned + registerAlternateMappingContext(ValueToValueMapTy &VM, + ValueMaterializer *Materializer = nullptr) { + MCs.push_back(MappingContext(VM, Materializer)); + return MCs.size() - 1; + } + + void addFlags(RemapFlags Flags); + + Value *mapValue(const Value *V); + void remapInstruction(Instruction *I); + void remapFunction(Function &F); + + Constant *mapConstant(const Constant *C) { + return cast_or_null<Constant>(mapValue(C)); + } + + /// Map metadata. + /// + /// Find the mapping for MD. Guarantees that the return will be resolved + /// (not an MDNode, or MDNode::isResolved() returns true). 
+ Metadata *mapMetadata(const Metadata *MD); + + void scheduleMapGlobalInitializer(GlobalVariable &GV, Constant &Init, + unsigned MCID); + void scheduleMapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix, + bool IsOldCtorDtor, + ArrayRef<Constant *> NewMembers, + unsigned MCID); + void scheduleMapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee, + unsigned MCID); + void scheduleRemapFunction(Function &F, unsigned MCID); + + void flush(); + +private: + void mapGlobalInitializer(GlobalVariable &GV, Constant &Init); + void mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix, + bool IsOldCtorDtor, + ArrayRef<Constant *> NewMembers); + void mapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee); + void remapFunction(Function &F, ValueToValueMapTy &VM); + + ValueToValueMapTy &getVM() { return *MCs[CurrentMCID].VM; } + ValueMaterializer *getMaterializer() { return MCs[CurrentMCID].Materializer; } + + Value *mapBlockAddress(const BlockAddress &BA); + + /// Map metadata that doesn't require visiting operands. + Optional<Metadata *> mapSimpleMetadata(const Metadata *MD); + + Metadata *mapToMetadata(const Metadata *Key, Metadata *Val); + Metadata *mapToSelf(const Metadata *MD); +}; + +class MDNodeMapper { + Mapper &M; + + /// Data about a node in \a UniquedGraph. + struct Data { + bool HasChanged = false; + unsigned ID = ~0u; + TempMDNode Placeholder; + + Data() {} + Data(Data &&X) + : HasChanged(std::move(X.HasChanged)), ID(std::move(X.ID)), + Placeholder(std::move(X.Placeholder)) {} + Data &operator=(Data &&X) { + HasChanged = std::move(X.HasChanged); + ID = std::move(X.ID); + Placeholder = std::move(X.Placeholder); + return *this; + } + }; + + /// A graph of uniqued nodes. + struct UniquedGraph { + SmallDenseMap<const Metadata *, Data, 32> Info; // Node properties. + SmallVector<MDNode *, 16> POT; // Post-order traversal. + + /// Propagate changed operands through the post-order traversal. + /// + /// Iteratively update \a Data::HasChanged for each node based on \a + /// Data::HasChanged of its operands, until fixed point. + void propagateChanges(); + + /// Get a forward reference to a node to use as an operand. + Metadata &getFwdReference(MDNode &Op); + }; + + /// Worklist of distinct nodes whose operands need to be remapped. + SmallVector<MDNode *, 16> DistinctWorklist; + + // Storage for a UniquedGraph. + SmallDenseMap<const Metadata *, Data, 32> InfoStorage; + SmallVector<MDNode *, 16> POTStorage; + +public: + MDNodeMapper(Mapper &M) : M(M) {} + + /// Map a metadata node (and its transitive operands). + /// + /// Map all the (unmapped) nodes in the subgraph under \c N. The iterative + /// algorithm handles distinct nodes and uniqued node subgraphs using + /// different strategies. + /// + /// Distinct nodes are immediately mapped and added to \a DistinctWorklist + /// using \a mapDistinctNode(). Their mapping can always be computed + /// immediately without visiting operands, even if their operands change. + /// + /// The mapping for uniqued nodes depends on whether their operands change. + /// \a mapTopLevelUniquedNode() traverses the transitive uniqued subgraph of + /// a node to calculate uniqued node mappings in bulk. Distinct leafs are + /// added to \a DistinctWorklist with \a mapDistinctNode(). + /// + /// After mapping \c N itself, this function remaps the operands of the + /// distinct nodes in \a DistinctWorklist until the entire subgraph under \c + /// N has been mapped. 
+ Metadata *map(const MDNode &N); + +private: + /// Map a top-level uniqued node and the uniqued subgraph underneath it. + /// + /// This builds up a post-order traversal of the (unmapped) uniqued subgraph + /// underneath \c FirstN and calculates the nodes' mapping. Each node uses + /// the identity mapping (\a Mapper::mapToSelf()) as long as all of its + /// operands uses the identity mapping. + /// + /// The algorithm works as follows: + /// + /// 1. \a createPOT(): traverse the uniqued subgraph under \c FirstN and + /// save the post-order traversal in the given \a UniquedGraph, tracking + /// nodes' operands change. + /// + /// 2. \a UniquedGraph::propagateChanges(): propagate changed operands + /// through the \a UniquedGraph until fixed point, following the rule + /// that if a node changes, any node that references must also change. + /// + /// 3. \a mapNodesInPOT(): map the uniqued nodes, creating new uniqued nodes + /// (referencing new operands) where necessary. + Metadata *mapTopLevelUniquedNode(const MDNode &FirstN); + + /// Try to map the operand of an \a MDNode. + /// + /// If \c Op is already mapped, return the mapping. If it's not an \a + /// MDNode, compute and return the mapping. If it's a distinct \a MDNode, + /// return the result of \a mapDistinctNode(). + /// + /// \return None if \c Op is an unmapped uniqued \a MDNode. + /// \post getMappedOp(Op) only returns None if this returns None. + Optional<Metadata *> tryToMapOperand(const Metadata *Op); + + /// Map a distinct node. + /// + /// Return the mapping for the distinct node \c N, saving the result in \a + /// DistinctWorklist for later remapping. + /// + /// \pre \c N is not yet mapped. + /// \pre \c N.isDistinct(). + MDNode *mapDistinctNode(const MDNode &N); + + /// Get a previously mapped node. + Optional<Metadata *> getMappedOp(const Metadata *Op) const; + + /// Create a post-order traversal of an unmapped uniqued node subgraph. + /// + /// This traverses the metadata graph deeply enough to map \c FirstN. It + /// uses \a tryToMapOperand() (via \a Mapper::mapSimplifiedNode()), so any + /// metadata that has already been mapped will not be part of the POT. + /// + /// Each node that has a changed operand from outside the graph (e.g., a + /// distinct node, an already-mapped uniqued node, or \a ConstantAsMetadata) + /// is marked with \a Data::HasChanged. + /// + /// \return \c true if any nodes in \c G have \a Data::HasChanged. + /// \post \c G.POT is a post-order traversal ending with \c FirstN. + /// \post \a Data::hasChanged in \c G.Info indicates whether any node needs + /// to change because of operands outside the graph. + bool createPOT(UniquedGraph &G, const MDNode &FirstN); + + /// Visit the operands of a uniqued node in the POT. + /// + /// Visit the operands in the range from \c I to \c E, returning the first + /// uniqued node we find that isn't yet in \c G. \c I is always advanced to + /// where to continue the loop through the operands. + /// + /// This sets \c HasChanged if any of the visited operands change. + MDNode *visitOperands(UniquedGraph &G, MDNode::op_iterator &I, + MDNode::op_iterator E, bool &HasChanged); + + /// Map all the nodes in the given uniqued graph. + /// + /// This visits all the nodes in \c G in post-order, using the identity + /// mapping or creating a new node depending on \a Data::HasChanged. + /// + /// \pre \a getMappedOp() returns None for nodes in \c G, but not for any of + /// their operands outside of \c G. 
+ /// \pre \a Data::HasChanged is true for a node in \c G iff any of its + /// operands have changed. + /// \post \a getMappedOp() returns the mapped node for every node in \c G. + void mapNodesInPOT(UniquedGraph &G); + + /// Remap a node's operands using the given functor. + /// + /// Iterate through the operands of \c N and update them in place using \c + /// mapOperand. + /// + /// \pre N.isDistinct() or N.isTemporary(). + template <class OperandMapper> + void remapOperands(MDNode &N, OperandMapper mapOperand); +}; + +} // end namespace + +Value *Mapper::mapValue(const Value *V) { + ValueToValueMapTy::iterator I = getVM().find(V); + // If the value already exists in the map, use it. - if (I != VM.end() && I->second) return I->second; - + if (I != getVM().end()) { + assert(I->second && "Unexpected null mapping"); + return I->second; + } + // If we have a materializer and it can materialize a value, use that. - if (Materializer) { - if (Value *NewV = - Materializer->materializeDeclFor(const_cast<Value *>(V))) { - VM[V] = NewV; - if (auto *NewGV = dyn_cast<GlobalValue>(NewV)) - Materializer->materializeInitFor( - NewGV, const_cast<GlobalValue *>(cast<GlobalValue>(V))); + if (auto *Materializer = getMaterializer()) { + if (Value *NewV = Materializer->materialize(const_cast<Value *>(V))) { + getVM()[V] = NewV; return NewV; } } @@ -51,13 +356,9 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, // Global values do not need to be seeded into the VM if they // are using the identity mapping. if (isa<GlobalValue>(V)) { - if (Flags & RF_NullMapMissingGlobalValues) { - assert(!(Flags & RF_IgnoreMissingEntries) && - "Illegal to specify both RF_NullMapMissingGlobalValues and " - "RF_IgnoreMissingEntries"); + if (Flags & RF_NullMapMissingGlobalValues) return nullptr; - } - return VM[V] = const_cast<Value*>(V); + return getVM()[V] = const_cast<Value *>(V); } if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) { @@ -70,28 +371,39 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, V = InlineAsm::get(NewTy, IA->getAsmString(), IA->getConstraintString(), IA->hasSideEffects(), IA->isAlignStack()); } - - return VM[V] = const_cast<Value*>(V); + + return getVM()[V] = const_cast<Value *>(V); } if (const auto *MDV = dyn_cast<MetadataAsValue>(V)) { const Metadata *MD = MDV->getMetadata(); + + if (auto *LAM = dyn_cast<LocalAsMetadata>(MD)) { + // Look through to grab the local value. + if (Value *LV = mapValue(LAM->getValue())) { + if (V == LAM->getValue()) + return const_cast<Value *>(V); + return MetadataAsValue::get(V->getContext(), ValueAsMetadata::get(LV)); + } + + // FIXME: always return nullptr once Verifier::verifyDominatesUse() + // ensures metadata operands only reference defined SSA values. + return (Flags & RF_IgnoreMissingLocals) + ? nullptr + : MetadataAsValue::get(V->getContext(), + MDTuple::get(V->getContext(), None)); + } + // If this is a module-level metadata and we know that nothing at the module // level is changing, then use an identity mapping. - if (!isa<LocalAsMetadata>(MD) && (Flags & RF_NoModuleLevelChanges)) - return VM[V] = const_cast<Value *>(V); - - auto *MappedMD = MapMetadata(MD, VM, Flags, TypeMapper, Materializer); - if (MD == MappedMD || (!MappedMD && (Flags & RF_IgnoreMissingEntries))) - return VM[V] = const_cast<Value *>(V); - - // FIXME: This assert crashes during bootstrap, but I think it should be - // correct. For now, just match behaviour from before the metadata/value - // split. 
- // - // assert((MappedMD || (Flags & RF_NullMapMissingGlobalValues)) && - // "Referenced metadata value not in value map"); - return VM[V] = MetadataAsValue::get(V->getContext(), MappedMD); + if (Flags & RF_NoModuleLevelChanges) + return getVM()[V] = const_cast<Value *>(V); + + // Map the metadata and turn it into a value. + auto *MappedMD = mapMetadata(MD); + if (MD == MappedMD) + return getVM()[V] = const_cast<Value *>(V); + return getVM()[V] = MetadataAsValue::get(V->getContext(), MappedMD); } // Okay, this either must be a constant (which may or may not be mappable) or @@ -99,25 +411,31 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, Constant *C = const_cast<Constant*>(dyn_cast<Constant>(V)); if (!C) return nullptr; - - if (BlockAddress *BA = dyn_cast<BlockAddress>(C)) { - Function *F = - cast<Function>(MapValue(BA->getFunction(), VM, Flags, TypeMapper, Materializer)); - BasicBlock *BB = cast_or_null<BasicBlock>(MapValue(BA->getBasicBlock(), VM, - Flags, TypeMapper, Materializer)); - return VM[V] = BlockAddress::get(F, BB ? BB : BA->getBasicBlock()); - } - + + if (BlockAddress *BA = dyn_cast<BlockAddress>(C)) + return mapBlockAddress(*BA); + + auto mapValueOrNull = [this](Value *V) { + auto Mapped = mapValue(V); + assert((Mapped || (Flags & RF_NullMapMissingGlobalValues)) && + "Unexpected null mapping for constant operand without " + "NullMapMissingGlobalValues flag"); + return Mapped; + }; + // Otherwise, we have some other constant to remap. Start by checking to see // if all operands have an identity remapping. unsigned OpNo = 0, NumOperands = C->getNumOperands(); Value *Mapped = nullptr; for (; OpNo != NumOperands; ++OpNo) { Value *Op = C->getOperand(OpNo); - Mapped = MapValue(Op, VM, Flags, TypeMapper, Materializer); - if (Mapped != C) break; + Mapped = mapValueOrNull(Op); + if (!Mapped) + return nullptr; + if (Mapped != Op) + break; } - + // See if the type mapper wants to remap the type as well. Type *NewTy = C->getType(); if (TypeMapper) @@ -126,23 +444,26 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, // If the result type and all operands match up, then just insert an identity // mapping. if (OpNo == NumOperands && NewTy == C->getType()) - return VM[V] = C; - + return getVM()[V] = C; + // Okay, we need to create a new constant. We've already processed some or // all of the operands, set them all up now. SmallVector<Constant*, 8> Ops; Ops.reserve(NumOperands); for (unsigned j = 0; j != OpNo; ++j) Ops.push_back(cast<Constant>(C->getOperand(j))); - + // If one of the operands mismatch, push it and the other mapped operands. if (OpNo != NumOperands) { Ops.push_back(cast<Constant>(Mapped)); - + // Map the rest of the operands that aren't processed yet. 
- for (++OpNo; OpNo != NumOperands; ++OpNo) - Ops.push_back(MapValue(cast<Constant>(C->getOperand(OpNo)), VM, - Flags, TypeMapper, Materializer)); + for (++OpNo; OpNo != NumOperands; ++OpNo) { + Mapped = mapValueOrNull(C->getOperand(OpNo)); + if (!Mapped) + return nullptr; + Ops.push_back(cast<Constant>(Mapped)); + } } Type *NewSrcTy = nullptr; if (TypeMapper) @@ -150,309 +471,407 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, NewSrcTy = TypeMapper->remapType(GEPO->getSourceElementType()); if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) - return VM[V] = CE->getWithOperands(Ops, NewTy, false, NewSrcTy); + return getVM()[V] = CE->getWithOperands(Ops, NewTy, false, NewSrcTy); if (isa<ConstantArray>(C)) - return VM[V] = ConstantArray::get(cast<ArrayType>(NewTy), Ops); + return getVM()[V] = ConstantArray::get(cast<ArrayType>(NewTy), Ops); if (isa<ConstantStruct>(C)) - return VM[V] = ConstantStruct::get(cast<StructType>(NewTy), Ops); + return getVM()[V] = ConstantStruct::get(cast<StructType>(NewTy), Ops); if (isa<ConstantVector>(C)) - return VM[V] = ConstantVector::get(Ops); + return getVM()[V] = ConstantVector::get(Ops); // If this is a no-operand constant, it must be because the type was remapped. if (isa<UndefValue>(C)) - return VM[V] = UndefValue::get(NewTy); + return getVM()[V] = UndefValue::get(NewTy); if (isa<ConstantAggregateZero>(C)) - return VM[V] = ConstantAggregateZero::get(NewTy); + return getVM()[V] = ConstantAggregateZero::get(NewTy); assert(isa<ConstantPointerNull>(C)); - return VM[V] = ConstantPointerNull::get(cast<PointerType>(NewTy)); -} - -static Metadata *mapToMetadata(ValueToValueMapTy &VM, const Metadata *Key, - Metadata *Val, ValueMaterializer *Materializer, - RemapFlags Flags) { - VM.MD()[Key].reset(Val); - if (Materializer && !(Flags & RF_HaveUnmaterializedMetadata)) { - auto *N = dyn_cast_or_null<MDNode>(Val); - // Need to invoke this once we have non-temporary MD. - if (!N || !N->isTemporary()) - Materializer->replaceTemporaryMetadata(Key, Val); + return getVM()[V] = ConstantPointerNull::get(cast<PointerType>(NewTy)); +} + +Value *Mapper::mapBlockAddress(const BlockAddress &BA) { + Function *F = cast<Function>(mapValue(BA.getFunction())); + + // F may not have materialized its initializer. In that case, create a + // dummy basic block for now, and replace it once we've materialized all + // the initializers. + BasicBlock *BB; + if (F->empty()) { + DelayedBBs.push_back(DelayedBasicBlock(BA)); + BB = DelayedBBs.back().TempBB.get(); + } else { + BB = cast_or_null<BasicBlock>(mapValue(BA.getBasicBlock())); } - return Val; + + return getVM()[&BA] = BlockAddress::get(F, BB ? 
BB : BA.getBasicBlock()); } -static Metadata *mapToSelf(ValueToValueMapTy &VM, const Metadata *MD, - ValueMaterializer *Materializer, RemapFlags Flags) { - return mapToMetadata(VM, MD, const_cast<Metadata *>(MD), Materializer, Flags); +Metadata *Mapper::mapToMetadata(const Metadata *Key, Metadata *Val) { + getVM().MD()[Key].reset(Val); + return Val; } -static Metadata *MapMetadataImpl(const Metadata *MD, - SmallVectorImpl<MDNode *> &DistinctWorklist, - ValueToValueMapTy &VM, RemapFlags Flags, - ValueMapTypeRemapper *TypeMapper, - ValueMaterializer *Materializer); +Metadata *Mapper::mapToSelf(const Metadata *MD) { + return mapToMetadata(MD, const_cast<Metadata *>(MD)); +} -static Metadata *mapMetadataOp(Metadata *Op, - SmallVectorImpl<MDNode *> &DistinctWorklist, - ValueToValueMapTy &VM, RemapFlags Flags, - ValueMapTypeRemapper *TypeMapper, - ValueMaterializer *Materializer) { +Optional<Metadata *> MDNodeMapper::tryToMapOperand(const Metadata *Op) { if (!Op) return nullptr; - if (Materializer && !Materializer->isMetadataNeeded(Op)) + if (Optional<Metadata *> MappedOp = M.mapSimpleMetadata(Op)) { +#ifndef NDEBUG + if (auto *CMD = dyn_cast<ConstantAsMetadata>(Op)) + assert((!*MappedOp || M.getVM().count(CMD->getValue()) || + M.getVM().getMappedMD(Op)) && + "Expected Value to be memoized"); + else + assert((isa<MDString>(Op) || M.getVM().getMappedMD(Op)) && + "Expected result to be memoized"); +#endif + return *MappedOp; + } + + const MDNode &N = *cast<MDNode>(Op); + if (N.isDistinct()) + return mapDistinctNode(N); + return None; +} + +MDNode *MDNodeMapper::mapDistinctNode(const MDNode &N) { + assert(N.isDistinct() && "Expected a distinct node"); + assert(!M.getVM().getMappedMD(&N) && "Expected an unmapped node"); + DistinctWorklist.push_back(cast<MDNode>( + (M.Flags & RF_MoveDistinctMDs) + ? M.mapToSelf(&N) + : M.mapToMetadata(&N, MDNode::replaceWithDistinct(N.clone())))); + return DistinctWorklist.back(); +} + +static ConstantAsMetadata *wrapConstantAsMetadata(const ConstantAsMetadata &CMD, + Value *MappedV) { + if (CMD.getValue() == MappedV) + return const_cast<ConstantAsMetadata *>(&CMD); + return MappedV ? ConstantAsMetadata::getConstant(MappedV) : nullptr; +} + +Optional<Metadata *> MDNodeMapper::getMappedOp(const Metadata *Op) const { + if (!Op) return nullptr; - if (Metadata *MappedOp = MapMetadataImpl(Op, DistinctWorklist, VM, Flags, - TypeMapper, Materializer)) - return MappedOp; - // Use identity map if MappedOp is null and we can ignore missing entries. - if (Flags & RF_IgnoreMissingEntries) + if (Optional<Metadata *> MappedOp = M.getVM().getMappedMD(Op)) + return *MappedOp; + + if (isa<MDString>(Op)) + return const_cast<Metadata *>(Op); + + if (auto *CMD = dyn_cast<ConstantAsMetadata>(Op)) + return wrapConstantAsMetadata(*CMD, M.getVM().lookup(CMD->getValue())); + + return None; +} + +Metadata &MDNodeMapper::UniquedGraph::getFwdReference(MDNode &Op) { + auto Where = Info.find(&Op); + assert(Where != Info.end() && "Expected a valid reference"); + + auto &OpD = Where->second; + if (!OpD.HasChanged) return Op; - // FIXME: This assert crashes during bootstrap, but I think it should be - // correct. For now, just match behaviour from before the metadata/value - // split. - // - // assert((Flags & RF_NullMapMissingGlobalValues) && - // "Referenced metadata not in value map!"); - return nullptr; + // Lazily construct a temporary node. + if (!OpD.Placeholder) + OpD.Placeholder = Op.clone(); + + return *OpD.Placeholder; } -/// Resolve uniquing cycles involving the given metadata. 
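getFwdReference above hands out a lazily created temporary when an operand inside a uniquing cycle has not been rebuilt yet; once the real replacement exists, every use of the placeholder is patched. A standalone model of that placeholder-then-patch technique (toy Node type, not LLVM metadata; resolve() stands in for replaceAllUsesWith):

```cpp
#include <cassert>
#include <map>
#include <memory>
#include <vector>

struct Node {
  std::vector<Node *> Ops;
};

struct Remapper {
  std::map<Node *, Node *> Mapped;                       // old -> new
  std::map<Node *, std::unique_ptr<Node>> Placeholders;  // old -> temp

  Node *getFwdReference(Node *Old) {
    if (Node *New = Mapped[Old])
      return New; // already rebuilt
    auto &Ph = Placeholders[Old];
    if (!Ph)
      Ph.reset(new Node()); // lazily construct the temporary stand-in
    return Ph.get();
  }

  // Once Old's replacement is built, redirect every use of the placeholder.
  void resolve(Node *Old, Node *New, std::vector<Node *> &AllNodes) {
    Mapped[Old] = New;
    auto It = Placeholders.find(Old);
    if (It == Placeholders.end())
      return;
    for (Node *N : AllNodes)
      for (Node *&Op : N->Ops)
        if (Op == It->second.get())
          Op = New;
    Placeholders.erase(It);
  }
};

int main() {
  Node A, B; // old cycle: A -> B -> A
  A.Ops = {&B};
  B.Ops = {&A};
  Remapper R;
  Node NewA;
  NewA.Ops = {R.getFwdReference(&B)}; // B not rebuilt yet: placeholder
  Node NewB;
  NewB.Ops = {&NewA};
  std::vector<Node *> All = {&NewA, &NewB};
  R.resolve(&B, &NewB, All);
  assert(NewA.Ops[0] == &NewB); // placeholder patched to the real node
}
```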
-static void resolveCycles(Metadata *MD, bool AllowTemps) { - if (auto *N = dyn_cast_or_null<MDNode>(MD)) { - if (AllowTemps && N->isTemporary()) - return; - if (!N->isResolved()) { - if (AllowTemps) - // Note that this will drop RAUW support on any temporaries, which - // blocks uniquing. If this ends up being an issue, in the future - // we can experiment with delaying resolving these nodes until - // after metadata is fully materialized (i.e. when linking metadata - // as a postpass after function importing). - N->resolveNonTemporaries(); - else - N->resolveCycles(); - } +template <class OperandMapper> +void MDNodeMapper::remapOperands(MDNode &N, OperandMapper mapOperand) { + assert(!N.isUniqued() && "Expected distinct or temporary nodes"); + for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) { + Metadata *Old = N.getOperand(I); + Metadata *New = mapOperand(Old); + + if (Old != New) + N.replaceOperandWith(I, New); } } -/// Remap the operands of an MDNode. -/// -/// If \c Node is temporary, uniquing cycles are ignored. If \c Node is -/// distinct, uniquing cycles are resolved as they're found. -/// -/// \pre \c Node.isDistinct() or \c Node.isTemporary(). -static bool remapOperands(MDNode &Node, - SmallVectorImpl<MDNode *> &DistinctWorklist, - ValueToValueMapTy &VM, RemapFlags Flags, - ValueMapTypeRemapper *TypeMapper, - ValueMaterializer *Materializer) { - assert(!Node.isUniqued() && "Expected temporary or distinct node"); - const bool IsDistinct = Node.isDistinct(); - - bool AnyChanged = false; - for (unsigned I = 0, E = Node.getNumOperands(); I != E; ++I) { - Metadata *Old = Node.getOperand(I); - Metadata *New = mapMetadataOp(Old, DistinctWorklist, VM, Flags, TypeMapper, - Materializer); - if (Old != New) { - AnyChanged = true; - Node.replaceOperandWith(I, New); - - // Resolve uniquing cycles underneath distinct nodes on the fly so they - // don't infect later operands. - if (IsDistinct) - resolveCycles(New, Flags & RF_HaveUnmaterializedMetadata); +namespace { +/// An entry in the worklist for the post-order traversal. +struct POTWorklistEntry { + MDNode *N; ///< Current node. + MDNode::op_iterator Op; ///< Current operand of \c N. + + /// Keep a flag of whether operands have changed in the worklist to avoid + /// hitting the map in \a UniquedGraph. + bool HasChanged = false; + + POTWorklistEntry(MDNode &N) : N(&N), Op(N.op_begin()) {} +}; +} // end namespace + +bool MDNodeMapper::createPOT(UniquedGraph &G, const MDNode &FirstN) { + assert(G.Info.empty() && "Expected a fresh traversal"); + assert(FirstN.isUniqued() && "Expected uniqued node in POT"); + + // Construct a post-order traversal of the uniqued subgraph under FirstN. + bool AnyChanges = false; + SmallVector<POTWorklistEntry, 16> Worklist; + Worklist.push_back(POTWorklistEntry(const_cast<MDNode &>(FirstN))); + (void)G.Info[&FirstN]; + while (!Worklist.empty()) { + // Start or continue the traversal through the this node's operands. + auto &WE = Worklist.back(); + if (MDNode *N = visitOperands(G, WE.Op, WE.N->op_end(), WE.HasChanged)) { + // Push a new node to traverse first. + Worklist.push_back(POTWorklistEntry(*N)); + continue; } + + // Push the node onto the POT. + assert(WE.N->isUniqued() && "Expected only uniqued nodes"); + assert(WE.Op == WE.N->op_end() && "Expected to visit all operands"); + auto &D = G.Info[WE.N]; + AnyChanges |= D.HasChanged = WE.HasChanged; + D.ID = G.POT.size(); + G.POT.push_back(WE.N); + + // Pop the node off the worklist. 
+ Worklist.pop_back(); } + return AnyChanges; +} - return AnyChanged; -} - -/// Map a distinct MDNode. -/// -/// Whether distinct nodes change is independent of their operands. If \a -/// RF_MoveDistinctMDs, then they are reused, and their operands remapped in -/// place; effectively, they're moved from one graph to another. Otherwise, -/// they're cloned/duplicated, and the new copy's operands are remapped. -static Metadata *mapDistinctNode(const MDNode *Node, - SmallVectorImpl<MDNode *> &DistinctWorklist, - ValueToValueMapTy &VM, RemapFlags Flags, - ValueMapTypeRemapper *TypeMapper, - ValueMaterializer *Materializer) { - assert(Node->isDistinct() && "Expected distinct node"); - - MDNode *NewMD; - if (Flags & RF_MoveDistinctMDs) - NewMD = const_cast<MDNode *>(Node); - else - NewMD = MDNode::replaceWithDistinct(Node->clone()); - - // Remap operands later. - DistinctWorklist.push_back(NewMD); - return mapToMetadata(VM, Node, NewMD, Materializer, Flags); -} - -/// \brief Map a uniqued MDNode. -/// -/// Uniqued nodes may not need to be recreated (they may map to themselves). -static Metadata *mapUniquedNode(const MDNode *Node, - SmallVectorImpl<MDNode *> &DistinctWorklist, - ValueToValueMapTy &VM, RemapFlags Flags, - ValueMapTypeRemapper *TypeMapper, - ValueMaterializer *Materializer) { - assert(((Flags & RF_HaveUnmaterializedMetadata) || Node->isUniqued()) && - "Expected uniqued node"); - - // Create a temporary node and map it upfront in case we have a uniquing - // cycle. If necessary, this mapping will get updated by RAUW logic before - // returning. - auto ClonedMD = Node->clone(); - mapToMetadata(VM, Node, ClonedMD.get(), Materializer, Flags); - if (!remapOperands(*ClonedMD, DistinctWorklist, VM, Flags, TypeMapper, - Materializer)) { - // No operands changed, so use the original. - ClonedMD->replaceAllUsesWith(const_cast<MDNode *>(Node)); - // Even though replaceAllUsesWith would have replaced the value map - // entry, we need to explictly map with the final non-temporary node - // to replace any temporary metadata via the callback. - return mapToSelf(VM, Node, Materializer, Flags); +MDNode *MDNodeMapper::visitOperands(UniquedGraph &G, MDNode::op_iterator &I, + MDNode::op_iterator E, bool &HasChanged) { + while (I != E) { + Metadata *Op = *I++; // Increment even on early return. + if (Optional<Metadata *> MappedOp = tryToMapOperand(Op)) { + // Check if the operand changes. + HasChanged |= Op != *MappedOp; + continue; + } + + // A uniqued metadata node. + MDNode &OpN = *cast<MDNode>(Op); + assert(OpN.isUniqued() && + "Only uniqued operands cannot be mapped immediately"); + if (G.Info.insert(std::make_pair(&OpN, Data())).second) + return &OpN; // This is a new one. Return it. } + return nullptr; +} - // Uniquify the cloned node. Explicitly map it with the final non-temporary - // node so that replacement of temporary metadata via the callback occurs. 
- return mapToMetadata(VM, Node, - MDNode::replaceWithUniqued(std::move(ClonedMD)), - Materializer, Flags); +void MDNodeMapper::UniquedGraph::propagateChanges() { + bool AnyChanges; + do { + AnyChanges = false; + for (MDNode *N : POT) { + auto &D = Info[N]; + if (D.HasChanged) + continue; + + if (!llvm::any_of(N->operands(), [&](const Metadata *Op) { + auto Where = Info.find(Op); + return Where != Info.end() && Where->second.HasChanged; + })) + continue; + + AnyChanges = D.HasChanged = true; + } + } while (AnyChanges); } -static Metadata *MapMetadataImpl(const Metadata *MD, - SmallVectorImpl<MDNode *> &DistinctWorklist, - ValueToValueMapTy &VM, RemapFlags Flags, - ValueMapTypeRemapper *TypeMapper, - ValueMaterializer *Materializer) { - // If the value already exists in the map, use it. - if (Metadata *NewMD = VM.MD().lookup(MD).get()) - return NewMD; +void MDNodeMapper::mapNodesInPOT(UniquedGraph &G) { + // Construct uniqued nodes, building forward references as necessary. + SmallVector<MDNode *, 16> CyclicNodes; + for (auto *N : G.POT) { + auto &D = G.Info[N]; + if (!D.HasChanged) { + // The node hasn't changed. + M.mapToSelf(N); + continue; + } - if (isa<MDString>(MD)) - return mapToSelf(VM, MD, Materializer, Flags); - - if (isa<ConstantAsMetadata>(MD)) - if ((Flags & RF_NoModuleLevelChanges)) - return mapToSelf(VM, MD, Materializer, Flags); - - if (const auto *VMD = dyn_cast<ValueAsMetadata>(MD)) { - Value *MappedV = - MapValue(VMD->getValue(), VM, Flags, TypeMapper, Materializer); - if (VMD->getValue() == MappedV || - (!MappedV && (Flags & RF_IgnoreMissingEntries))) - return mapToSelf(VM, MD, Materializer, Flags); - - // FIXME: This assert crashes during bootstrap, but I think it should be - // correct. For now, just match behaviour from before the metadata/value - // split. - // - // assert((MappedV || (Flags & RF_NullMapMissingGlobalValues)) && - // "Referenced metadata not in value map!"); - if (MappedV) - return mapToMetadata(VM, MD, ValueAsMetadata::get(MappedV), Materializer, - Flags); - return nullptr; + // Remember whether this node had a placeholder. + bool HadPlaceholder(D.Placeholder); + + // Clone the uniqued node and remap the operands. + TempMDNode ClonedN = D.Placeholder ? std::move(D.Placeholder) : N->clone(); + remapOperands(*ClonedN, [this, &D, &G](Metadata *Old) { + if (Optional<Metadata *> MappedOp = getMappedOp(Old)) + return *MappedOp; + assert(G.Info[Old].ID > D.ID && "Expected a forward reference"); + return &G.getFwdReference(*cast<MDNode>(Old)); + }); + + auto *NewN = MDNode::replaceWithUniqued(std::move(ClonedN)); + M.mapToMetadata(N, NewN); + + // Nodes that were referenced out of order in the POT are involved in a + // uniquing cycle. + if (HadPlaceholder) + CyclicNodes.push_back(NewN); } - // Note: this cast precedes the Flags check so we always get its associated - // assertion. - const MDNode *Node = cast<MDNode>(MD); + // Resolve cycles. + for (auto *N : CyclicNodes) + if (!N->isResolved()) + N->resolveCycles(); +} - // If this is a module-level metadata and we know that nothing at the - // module level is changing, then use an identity mapping. - if (Flags & RF_NoModuleLevelChanges) - return mapToSelf(VM, MD, Materializer, Flags); +Metadata *MDNodeMapper::map(const MDNode &N) { + assert(DistinctWorklist.empty() && "MDNodeMapper::map is not recursive"); + assert(!(M.Flags & RF_NoModuleLevelChanges) && + "MDNodeMapper::map assumes module-level changes"); // Require resolved nodes whenever metadata might be remapped. 
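propagateChanges above iterates the post-order list to a fixed point so that a uniqued node is rebuilt iff anything it transitively references changed. A standalone model of that loop (indices stand in for metadata operands; with a true post-order one pass usually suffices, and the outer loop guards the general case):

```cpp
#include <cassert>
#include <vector>

struct POTNode {
  std::vector<int> Ops;    // indices of operands within the POT
  bool HasChanged = false; // does this node need to be rebuilt?
};

void propagateChanges(std::vector<POTNode> &POT) {
  bool AnyChanges;
  do {
    AnyChanges = false;
    for (POTNode &N : POT) {
      if (N.HasChanged)
        continue;
      for (int Op : N.Ops)
        if (POT[Op].HasChanged) { // a changed operand taints this node
          AnyChanges = N.HasChanged = true;
          break;
        }
    }
  } while (AnyChanges); // stop at the fixed point
}

int main() {
  // Chain 0 <- 1 <- 2: marking node 0 changed must propagate to 1 and 2.
  std::vector<POTNode> POT(3);
  POT[1].Ops = {0};
  POT[2].Ops = {1};
  POT[0].HasChanged = true;
  propagateChanges(POT);
  assert(POT[1].HasChanged && POT[2].HasChanged);
}
```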
- assert(((Flags & RF_HaveUnmaterializedMetadata) || Node->isResolved()) && - "Unexpected unresolved node"); - - if (Materializer && Node->isTemporary()) { - assert(Flags & RF_HaveUnmaterializedMetadata); - Metadata *TempMD = - Materializer->mapTemporaryMetadata(const_cast<Metadata *>(MD)); - // If the above callback returned an existing temporary node, use it - // instead of the current temporary node. This happens when earlier - // function importing passes already created and saved a temporary - // metadata node for the same value id. - if (TempMD) { - mapToMetadata(VM, MD, TempMD, Materializer, Flags); - return TempMD; - } + assert(N.isResolved() && "Unexpected unresolved node"); + + Metadata *MappedN = + N.isUniqued() ? mapTopLevelUniquedNode(N) : mapDistinctNode(N); + while (!DistinctWorklist.empty()) + remapOperands(*DistinctWorklist.pop_back_val(), [this](Metadata *Old) { + if (Optional<Metadata *> MappedOp = tryToMapOperand(Old)) + return *MappedOp; + return mapTopLevelUniquedNode(*cast<MDNode>(Old)); + }); + return MappedN; +} + +Metadata *MDNodeMapper::mapTopLevelUniquedNode(const MDNode &FirstN) { + assert(FirstN.isUniqued() && "Expected uniqued node"); + + // Create a post-order traversal of uniqued nodes under FirstN. + UniquedGraph G; + if (!createPOT(G, FirstN)) { + // Return early if no nodes have changed. + for (const MDNode *N : G.POT) + M.mapToSelf(N); + return &const_cast<MDNode &>(FirstN); } - if (Node->isDistinct()) - return mapDistinctNode(Node, DistinctWorklist, VM, Flags, TypeMapper, - Materializer); + // Update graph with all nodes that have changed. + G.propagateChanges(); - return mapUniquedNode(Node, DistinctWorklist, VM, Flags, TypeMapper, - Materializer); + // Map all the nodes in the graph. + mapNodesInPOT(G); + + // Return the original node, remapped. + return *getMappedOp(&FirstN); } -Metadata *llvm::MapMetadata(const Metadata *MD, ValueToValueMapTy &VM, - RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, - ValueMaterializer *Materializer) { - SmallVector<MDNode *, 8> DistinctWorklist; - Metadata *NewMD = MapMetadataImpl(MD, DistinctWorklist, VM, Flags, TypeMapper, - Materializer); +namespace { - // When there are no module-level changes, it's possible that the metadata - // graph has temporaries. Skip the logic to resolve cycles, since it's - // unnecessary (and invalid) in that case. - if (Flags & RF_NoModuleLevelChanges) - return NewMD; +struct MapMetadataDisabler { + ValueToValueMapTy &VM; - // Resolve cycles involving the entry metadata. - resolveCycles(NewMD, Flags & RF_HaveUnmaterializedMetadata); + MapMetadataDisabler(ValueToValueMapTy &VM) : VM(VM) { + VM.disableMapMetadata(); + } + ~MapMetadataDisabler() { VM.enableMapMetadata(); } +}; - // Remap the operands of distinct MDNodes. - while (!DistinctWorklist.empty()) - remapOperands(*DistinctWorklist.pop_back_val(), DistinctWorklist, VM, Flags, - TypeMapper, Materializer); +} // end namespace - return NewMD; +Optional<Metadata *> Mapper::mapSimpleMetadata(const Metadata *MD) { + // If the value already exists in the map, use it. + if (Optional<Metadata *> NewMD = getVM().getMappedMD(MD)) + return *NewMD; + + if (isa<MDString>(MD)) + return const_cast<Metadata *>(MD); + + // This is a module-level metadata. If nothing at the module level is + // changing, use an identity mapping. + if ((Flags & RF_NoModuleLevelChanges)) + return const_cast<Metadata *>(MD); + + if (auto *CMD = dyn_cast<ConstantAsMetadata>(MD)) { + // Disallow recursion into metadata mapping through mapValue. 
+ MapMetadataDisabler MMD(getVM()); + + // Don't memoize ConstantAsMetadata. Instead of lasting until the + // LLVMContext is destroyed, they can be deleted when the GlobalValue they + // reference is destructed. These aren't super common, so the extra + // indirection isn't that expensive. + return wrapConstantAsMetadata(*CMD, mapValue(CMD->getValue())); + } + + assert(isa<MDNode>(MD) && "Expected a metadata node"); + + return None; } -MDNode *llvm::MapMetadata(const MDNode *MD, ValueToValueMapTy &VM, - RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, - ValueMaterializer *Materializer) { - return cast<MDNode>(MapMetadata(static_cast<const Metadata *>(MD), VM, Flags, - TypeMapper, Materializer)); +Metadata *Mapper::mapMetadata(const Metadata *MD) { + assert(MD && "Expected valid metadata"); + assert(!isa<LocalAsMetadata>(MD) && "Unexpected local metadata"); + + if (Optional<Metadata *> NewMD = mapSimpleMetadata(MD)) + return *NewMD; + + return MDNodeMapper(*this).map(*cast<MDNode>(MD)); +} + +void Mapper::flush() { + // Flush out the worklist of global values. + while (!Worklist.empty()) { + WorklistEntry E = Worklist.pop_back_val(); + CurrentMCID = E.MCID; + switch (E.Kind) { + case WorklistEntry::MapGlobalInit: + E.Data.GVInit.GV->setInitializer(mapConstant(E.Data.GVInit.Init)); + break; + case WorklistEntry::MapAppendingVar: { + unsigned PrefixSize = AppendingInits.size() - E.AppendingGVNumNewMembers; + mapAppendingVariable(*E.Data.AppendingGV.GV, + E.Data.AppendingGV.InitPrefix, + E.AppendingGVIsOldCtorDtor, + makeArrayRef(AppendingInits).slice(PrefixSize)); + AppendingInits.resize(PrefixSize); + break; + } + case WorklistEntry::MapGlobalAliasee: + E.Data.GlobalAliasee.GA->setAliasee( + mapConstant(E.Data.GlobalAliasee.Aliasee)); + break; + case WorklistEntry::RemapFunction: + remapFunction(*E.Data.RemapF); + break; + } + } + CurrentMCID = 0; + + // Finish logic for block addresses now that all global values have been + // handled. + while (!DelayedBBs.empty()) { + DelayedBasicBlock DBB = DelayedBBs.pop_back_val(); + BasicBlock *BB = cast_or_null<BasicBlock>(mapValue(DBB.OldBB)); + DBB.TempBB->replaceAllUsesWith(BB ? BB : DBB.OldBB); + } } -/// RemapInstruction - Convert the instruction operands from referencing the -/// current values into those specified by VMap. -/// -void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap, - RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, - ValueMaterializer *Materializer){ +void Mapper::remapInstruction(Instruction *I) { // Remap operands. - for (User::op_iterator op = I->op_begin(), E = I->op_end(); op != E; ++op) { - Value *V = MapValue(*op, VMap, Flags, TypeMapper, Materializer); + for (Use &Op : I->operands()) { + Value *V = mapValue(Op); // If we aren't ignoring missing entries, assert that something happened. if (V) - *op = V; + Op = V; else - assert((Flags & RF_IgnoreMissingEntries) && + assert((Flags & RF_IgnoreMissingLocals) && "Referenced value not in value map!"); } // Remap phi nodes' incoming blocks. if (PHINode *PN = dyn_cast<PHINode>(I)) { for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - Value *V = MapValue(PN->getIncomingBlock(i), VMap, Flags); + Value *V = mapValue(PN->getIncomingBlock(i)); // If we aren't ignoring missing entries, assert that something happened. 
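Mapper::flush(), added above, drains a LIFO worklist of deferred jobs (global initializers, appending variables, aliasees, whole-function remaps) and only afterwards patches up the delayed block addresses. Reduced to a standalone shape (hypothetical names; not the patch's types):

#include <functional>
#include <vector>

// Deferred-work pattern: record cheap job descriptions now, drain them LIFO
// later, so no job starts in the middle of another mapping step.
struct DeferredWork {
  std::vector<std::function<void()>> Worklist;
  std::vector<std::function<void()>> DelayedFixups;

  void flush() {
    // Jobs may schedule further jobs; keep draining until the list is empty.
    while (!Worklist.empty()) {
      auto Job = std::move(Worklist.back());
      Worklist.pop_back();
      Job();
    }
    // Fixups (like the delayed basic blocks above) run only once every
    // scheduled job has finished.
    for (auto &Fix : DelayedFixups)
      Fix();
    DelayedFixups.clear();
  }
};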
if (V) PN->setIncomingBlock(i, cast<BasicBlock>(V)); else - assert((Flags & RF_IgnoreMissingEntries) && + assert((Flags & RF_IgnoreMissingLocals) && "Referenced block not in value map!"); } } @@ -462,11 +881,11 @@ void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap, I->getAllMetadata(MDs); for (const auto &MI : MDs) { MDNode *Old = MI.second; - MDNode *New = MapMetadata(Old, VMap, Flags, TypeMapper, Materializer); + MDNode *New = cast_or_null<MDNode>(mapMetadata(Old)); if (New != Old) I->setMetadata(MI.first, New); } - + if (!TypeMapper) return; @@ -491,3 +910,213 @@ void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap, } I->mutateType(TypeMapper->remapType(I->getType())); } + +void Mapper::remapFunction(Function &F) { + // Remap the operands. + for (Use &Op : F.operands()) + if (Op) + Op = mapValue(Op); + + // Remap the metadata attachments. + SmallVector<std::pair<unsigned, MDNode *>, 8> MDs; + F.getAllMetadata(MDs); + F.clearMetadata(); + for (const auto &I : MDs) + F.addMetadata(I.first, *cast<MDNode>(mapMetadata(I.second))); + + // Remap the argument types. + if (TypeMapper) + for (Argument &A : F.args()) + A.mutateType(TypeMapper->remapType(A.getType())); + + // Remap the instructions. + for (BasicBlock &BB : F) + for (Instruction &I : BB) + remapInstruction(&I); +} + +void Mapper::mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix, + bool IsOldCtorDtor, + ArrayRef<Constant *> NewMembers) { + SmallVector<Constant *, 16> Elements; + if (InitPrefix) { + unsigned NumElements = + cast<ArrayType>(InitPrefix->getType())->getNumElements(); + for (unsigned I = 0; I != NumElements; ++I) + Elements.push_back(InitPrefix->getAggregateElement(I)); + } + + PointerType *VoidPtrTy; + Type *EltTy; + if (IsOldCtorDtor) { + // FIXME: This upgrade is done during linking to support the C API. See + // also IRLinker::linkAppendingVarProto() in IRMover.cpp. 
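mapAppendingVariable(), which this hunk continues, rebuilds an appending global's initializer as the old prefix elements followed by the remapped new members. Stripped of the LLVM constant types, the concatenation step is (hypothetical helper, for illustration only):

#include <vector>

// Copy the prefix, then append each new member after running it through the
// mapping function; the caller installs the result as the new initializer.
template <typename T, typename MapFn>
std::vector<T> rebuildAppendingInit(const std::vector<T> &Prefix,
                                    const std::vector<T> &NewMembers,
                                    MapFn MapMember) {
  std::vector<T> Elements(Prefix.begin(), Prefix.end());
  for (const T &M : NewMembers)
    Elements.push_back(MapMember(M));
  return Elements;
}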
+ VoidPtrTy = Type::getInt8Ty(GV.getContext())->getPointerTo(); + auto &ST = *cast<StructType>(NewMembers.front()->getType()); + Type *Tys[3] = {ST.getElementType(0), ST.getElementType(1), VoidPtrTy}; + EltTy = StructType::get(GV.getContext(), Tys, false); + } + + for (auto *V : NewMembers) { + Constant *NewV; + if (IsOldCtorDtor) { + auto *S = cast<ConstantStruct>(V); + auto *E1 = mapValue(S->getOperand(0)); + auto *E2 = mapValue(S->getOperand(1)); + Value *Null = Constant::getNullValue(VoidPtrTy); + NewV = + ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null, nullptr); + } else { + NewV = cast_or_null<Constant>(mapValue(V)); + } + Elements.push_back(NewV); + } + + GV.setInitializer(ConstantArray::get( + cast<ArrayType>(GV.getType()->getElementType()), Elements)); +} + +void Mapper::scheduleMapGlobalInitializer(GlobalVariable &GV, Constant &Init, + unsigned MCID) { + assert(AlreadyScheduled.insert(&GV).second && "Should not reschedule"); + assert(MCID < MCs.size() && "Invalid mapping context"); + + WorklistEntry WE; + WE.Kind = WorklistEntry::MapGlobalInit; + WE.MCID = MCID; + WE.Data.GVInit.GV = &GV; + WE.Data.GVInit.Init = &Init; + Worklist.push_back(WE); +} + +void Mapper::scheduleMapAppendingVariable(GlobalVariable &GV, + Constant *InitPrefix, + bool IsOldCtorDtor, + ArrayRef<Constant *> NewMembers, + unsigned MCID) { + assert(AlreadyScheduled.insert(&GV).second && "Should not reschedule"); + assert(MCID < MCs.size() && "Invalid mapping context"); + + WorklistEntry WE; + WE.Kind = WorklistEntry::MapAppendingVar; + WE.MCID = MCID; + WE.Data.AppendingGV.GV = &GV; + WE.Data.AppendingGV.InitPrefix = InitPrefix; + WE.AppendingGVIsOldCtorDtor = IsOldCtorDtor; + WE.AppendingGVNumNewMembers = NewMembers.size(); + Worklist.push_back(WE); + AppendingInits.append(NewMembers.begin(), NewMembers.end()); +} + +void Mapper::scheduleMapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee, + unsigned MCID) { + assert(AlreadyScheduled.insert(&GA).second && "Should not reschedule"); + assert(MCID < MCs.size() && "Invalid mapping context"); + + WorklistEntry WE; + WE.Kind = WorklistEntry::MapGlobalAliasee; + WE.MCID = MCID; + WE.Data.GlobalAliasee.GA = &GA; + WE.Data.GlobalAliasee.Aliasee = &Aliasee; + Worklist.push_back(WE); +} + +void Mapper::scheduleRemapFunction(Function &F, unsigned MCID) { + assert(AlreadyScheduled.insert(&F).second && "Should not reschedule"); + assert(MCID < MCs.size() && "Invalid mapping context"); + + WorklistEntry WE; + WE.Kind = WorklistEntry::RemapFunction; + WE.MCID = MCID; + WE.Data.RemapF = &F; + Worklist.push_back(WE); +} + +void Mapper::addFlags(RemapFlags Flags) { + assert(!hasWorkToDo() && "Expected to have flushed the worklist"); + this->Flags = this->Flags | Flags; +} + +static Mapper *getAsMapper(void *pImpl) { + return reinterpret_cast<Mapper *>(pImpl); +} + +namespace { + +class FlushingMapper { + Mapper &M; + +public: + explicit FlushingMapper(void *pImpl) : M(*getAsMapper(pImpl)) { + assert(!M.hasWorkToDo() && "Expected to be flushed"); + } + ~FlushingMapper() { M.flush(); } + Mapper *operator->() const { return &M; } +}; + +} // end namespace + +ValueMapper::ValueMapper(ValueToValueMapTy &VM, RemapFlags Flags, + ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) + : pImpl(new Mapper(VM, Flags, TypeMapper, Materializer)) {} + +ValueMapper::~ValueMapper() { delete getAsMapper(pImpl); } + +unsigned +ValueMapper::registerAlternateMappingContext(ValueToValueMapTy &VM, + ValueMaterializer *Materializer) { + return 
getAsMapper(pImpl)->registerAlternateMappingContext(VM, Materializer);
+}
+
+void ValueMapper::addFlags(RemapFlags Flags) {
+  FlushingMapper(pImpl)->addFlags(Flags);
+}
+
+Value *ValueMapper::mapValue(const Value &V) {
+  return FlushingMapper(pImpl)->mapValue(&V);
+}
+
+Constant *ValueMapper::mapConstant(const Constant &C) {
+  return cast_or_null<Constant>(mapValue(C));
+}
+
+Metadata *ValueMapper::mapMetadata(const Metadata &MD) {
+  return FlushingMapper(pImpl)->mapMetadata(&MD);
+}
+
+MDNode *ValueMapper::mapMDNode(const MDNode &N) {
+  return cast_or_null<MDNode>(mapMetadata(N));
+}
+
+void ValueMapper::remapInstruction(Instruction &I) {
+  FlushingMapper(pImpl)->remapInstruction(&I);
+}
+
+void ValueMapper::remapFunction(Function &F) {
+  FlushingMapper(pImpl)->remapFunction(F);
+}
+
+void ValueMapper::scheduleMapGlobalInitializer(GlobalVariable &GV,
+                                               Constant &Init,
+                                               unsigned MCID) {
+  getAsMapper(pImpl)->scheduleMapGlobalInitializer(GV, Init, MCID);
+}
+
+void ValueMapper::scheduleMapAppendingVariable(GlobalVariable &GV,
+                                               Constant *InitPrefix,
+                                               bool IsOldCtorDtor,
+                                               ArrayRef<Constant *> NewMembers,
+                                               unsigned MCID) {
+  getAsMapper(pImpl)->scheduleMapAppendingVariable(
+      GV, InitPrefix, IsOldCtorDtor, NewMembers, MCID);
+}
+
+void ValueMapper::scheduleMapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee,
+                                           unsigned MCID) {
+  getAsMapper(pImpl)->scheduleMapGlobalAliasee(GA, Aliasee, MCID);
+}
+
+void ValueMapper::scheduleRemapFunction(Function &F, unsigned MCID) {
+  getAsMapper(pImpl)->scheduleRemapFunction(F, MCID);
+}
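Taken together, the facade above gives two modes of use: the map*/remap* entry points flush all pending work before returning (FlushingMapper guarantees this in its destructor), while the schedule* entry points defer work until the next flushing call. A plausible client-side sketch, assuming only the interface shown in this patch (remapClone and its setup are invented for illustration):

#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
using namespace llvm;

// Remap a cloned function's body through the new ValueMapper facade.  VMap
// is assumed to already map the original values to their clones.
static void remapClone(Function &NewF, ValueToValueMapTy &VMap) {
  ValueMapper VM(VMap, RF_IgnoreMissingLocals, /*TypeMapper=*/nullptr,
                 /*Materializer=*/nullptr);

  // One call remaps operands, metadata attachments, and instructions; any
  // work it schedules internally is drained before the call returns.
  VM.remapFunction(NewF);
}

Clients that batch many globals (as the IRMover referenced in the FIXME above does) would instead call the schedule* methods, possibly with a mapping-context ID from registerAlternateMappingContext(), and rely on the next mapping call to flush the worklist.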