Diffstat (limited to 'contrib/llvm/lib/Transforms/Scalar')
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/ADCE.cpp | 43
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp | 12
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/BDCE.cpp | 4
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp | 21
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp | 25
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/DCE.cpp | 9
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp | 62
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp | 92
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp | 11
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GVN.cpp | 498
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp | 89
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp | 11
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp | 19
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp | 44
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp | 903
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp | 293
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LICM.cpp | 170
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp | 19
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp | 127
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp | 46
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 8
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp | 4
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 2
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp | 105
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp | 7
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp | 282
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp | 49
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 1
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp | 40
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 432
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 172
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp | 301
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp | 4
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 147
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp | 179
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp | 12
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp | 2750
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp | 12
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp | 13
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp | 59
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SCCP.cpp | 171
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SROA.cpp | 24
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Scalar.cpp | 9
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp | 19
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp | 76
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Sink.cpp | 12
46 files changed, 5149 insertions(+), 2239 deletions(-)
diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
index adc903cab31b..5b467dc9fe12 100644
--- a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -41,8 +41,8 @@ using namespace llvm;
STATISTIC(NumRemoved, "Number of instructions removed");
STATISTIC(NumBranchesRemoved, "Number of branch instructions removed");
-// This is a tempoary option until we change the interface
-// to this pass based on optimization level.
+// This is a temporary option until we change the interface to this pass based
+// on optimization level.
static cl::opt<bool> RemoveControlFlowFlag("adce-remove-control-flow",
cl::init(true), cl::Hidden);
@@ -110,7 +110,7 @@ class AggressiveDeadCodeElimination {
/// The set of blocks which we have determined whose control
/// dependence sources must be live and which have not had
- /// those dependences analyized.
+ /// those dependences analyzed.
SmallPtrSet<BasicBlock *, 16> NewLiveBlocks;
/// Set up auxiliary data structures for Instructions and BasicBlocks and
@@ -145,7 +145,7 @@ class AggressiveDeadCodeElimination {
/// was removed.
bool removeDeadInstructions();
- /// Identify connected sections of the control flow grap which have
+ /// Identify connected sections of the control flow graph which have
/// dead terminators and rewrite the control flow graph to remove them.
void updateDeadRegions();
@@ -234,7 +234,7 @@ void AggressiveDeadCodeElimination::initialize() {
return Iter != end() && Iter->second;
}
} State;
-
+
State.reserve(F.size());
// Iterate over blocks in depth-first pre-order and
// treat all edges to a block already seen as loop back edges
@@ -262,25 +262,6 @@ void AggressiveDeadCodeElimination::initialize() {
continue;
auto *BB = BBInfo.BB;
if (!PDT.getNode(BB)) {
- markLive(BBInfo.Terminator);
- continue;
- }
- for (auto *Succ : successors(BB))
- if (!PDT.getNode(Succ)) {
- markLive(BBInfo.Terminator);
- break;
- }
- }
-
- // Mark blocks live if there is no path from the block to the
- // return of the function or a successor for which this is true.
- // This protects IDFCalculator which cannot handle such blocks.
- for (auto &BBInfoPair : BlockInfo) {
- auto &BBInfo = BBInfoPair.second;
- if (BBInfo.terminatorIsLive())
- continue;
- auto *BB = BBInfo.BB;
- if (!PDT.getNode(BB)) {
DEBUG(dbgs() << "Not post-dominated by return: " << BB->getName()
<< '\n';);
markLive(BBInfo.Terminator);
@@ -579,7 +560,7 @@ void AggressiveDeadCodeElimination::updateDeadRegions() {
PreferredSucc = Info;
}
assert((PreferredSucc && PreferredSucc->PostOrder > 0) &&
- "Failed to find safe successor for dead branc");
+ "Failed to find safe successor for dead branch");
bool First = true;
for (auto *Succ : successors(BB)) {
if (!First || Succ != PreferredSucc->BB)
@@ -594,13 +575,13 @@ void AggressiveDeadCodeElimination::updateDeadRegions() {
// reverse top-sort order
void AggressiveDeadCodeElimination::computeReversePostOrder() {
-
- // This provides a post-order numbering of the reverse conrtol flow graph
+
+ // This provides a post-order numbering of the reverse control flow graph
// Note that it is incomplete in the presence of infinite loops but we don't
// need numbers blocks which don't reach the end of the functions since
// all branches in those blocks are forced live.
-
- // For each block without successors, extend the DFS from the bloack
+
+ // For each block without successors, extend the DFS from the block
// backward through the graph
SmallPtrSet<BasicBlock*, 16> Visited;
unsigned PostOrder = 0;
@@ -644,8 +625,8 @@ PreservedAnalyses ADCEPass::run(Function &F, FunctionAnalysisManager &FAM) {
if (!AggressiveDeadCodeElimination(F, PDT).performDeadCodeElimination())
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
- auto PA = PreservedAnalyses();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
return PA;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
index c1df3173c0fc..fd931c521c8f 100644
--- a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -438,19 +438,13 @@ AlignmentFromAssumptionsPass::run(Function &F, FunctionAnalysisManager &AM) {
AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
- bool Changed = runImpl(F, AC, &SE, &DT);
-
- // FIXME: We need to invalidate this to avoid PR28400. Is there a better
- // solution?
- AM.invalidate<ScalarEvolutionAnalysis>(F);
-
- if (!Changed)
+ if (!runImpl(F, AC, &SE, &DT))
return PreservedAnalyses::all();
+
PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<AAManager>();
PA.preserve<ScalarEvolutionAnalysis>();
PA.preserve<GlobalsAA>();
- PA.preserve<LoopAnalysis>();
- PA.preserve<DominatorTreeAnalysis>();
return PA;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
index 251b38707769..61e8700f1cd6 100644
--- a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
@@ -80,8 +80,8 @@ PreservedAnalyses BDCEPass::run(Function &F, FunctionAnalysisManager &AM) {
if (!bitTrackingDCE(F, DB))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
- auto PA = PreservedAnalyses();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
return PA;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
index 38262514c9ec..ee6333e88716 100644
--- a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -136,8 +136,16 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
if (Idx != ~0U && isa<PHINode>(Inst))
return cast<PHINode>(Inst)->getIncomingBlock(Idx)->getTerminator();
- BasicBlock *IDom = DT->getNode(Inst->getParent())->getIDom()->getBlock();
- return IDom->getTerminator();
+ // This must be an EH pad. Iterate over immediate dominators until we find a
+ // non-EH pad. We need to skip over catchswitch blocks, which are both EH pads
+ // and terminators.
+ auto IDom = DT->getNode(Inst->getParent())->getIDom();
+ while (IDom->getBlock()->isEHPad()) {
+ assert(Entry != IDom->getBlock() && "eh pad in entry block");
+ IDom = IDom->getIDom();
+ }
+
+ return IDom->getBlock()->getTerminator();
}
/// \brief Find an insertion point that dominates all uses.
@@ -289,8 +297,8 @@ void ConstantHoistingPass::collectConstantCandidates(Function &Fn) {
// bit widths (APInt Operator- does not like that). If the value cannot be
// represented in uint64 we return an "empty" APInt. This is then interpreted
// as the value is not in range.
-static llvm::Optional<APInt> calculateOffsetDiff(APInt V1, APInt V2)
-{
+static llvm::Optional<APInt> calculateOffsetDiff(const APInt &V1,
+ const APInt &V2) {
llvm::Optional<APInt> Res = None;
unsigned BW = V1.getBitWidth() > V2.getBitWidth() ?
V1.getBitWidth() : V2.getBitWidth();
@@ -623,6 +631,7 @@ PreservedAnalyses ConstantHoistingPass::run(Function &F,
if (!runImpl(F, TTI, DT, F.getEntryBlock()))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
- return PreservedAnalyses::none();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 84f9373ae914..c843c61ea94e 100644
--- a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -235,9 +235,8 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
// Analyse each switch case in turn. This is done in reverse order so that
// removing a case doesn't cause trouble for the iteration.
bool Changed = false;
- for (SwitchInst::CaseIt CI = SI->case_end(), CE = SI->case_begin(); CI-- != CE;
- ) {
- ConstantInt *Case = CI.getCaseValue();
+ for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) {
+ ConstantInt *Case = CI->getCaseValue();
// Check to see if the switch condition is equal to/not equal to the case
// value on every incoming edge, equal/not equal being the same each time.
@@ -270,8 +269,9 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
if (State == LazyValueInfo::False) {
// This case never fires - remove it.
- CI.getCaseSuccessor()->removePredecessor(BB);
- SI->removeCase(CI); // Does not invalidate the iterator.
+ CI->getCaseSuccessor()->removePredecessor(BB);
+ CI = SI->removeCase(CI);
+ CE = SI->case_end();
// The condition can be modified by removePredecessor's PHI simplification
// logic.
@@ -279,7 +279,9 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
++NumDeadCases;
Changed = true;
- } else if (State == LazyValueInfo::True) {
+ continue;
+ }
+ if (State == LazyValueInfo::True) {
// This case always fires. Arrange for the switch to be turned into an
// unconditional branch by replacing the switch condition with the case
// value.
@@ -288,6 +290,9 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
Changed = true;
break;
}
+
+ // Increment the case iterator sense we didn't delete it.
+ ++CI;
}
if (Changed)
@@ -308,7 +313,7 @@ static bool processCallSite(CallSite CS, LazyValueInfo *LVI) {
// Try to mark pointer typed parameters as non-null. We skip the
// relatively expensive analysis for constants which are obviously either
// null or non-null to start with.
- if (Type && !CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) &&
+ if (Type && !CS.paramHasAttr(ArgNo, Attribute::NonNull) &&
!isa<Constant>(V) &&
LVI->getPredicateAt(ICmpInst::ICMP_EQ, V,
ConstantPointerNull::get(Type),
@@ -322,7 +327,7 @@ static bool processCallSite(CallSite CS, LazyValueInfo *LVI) {
if (Indices.empty())
return false;
- AttributeSet AS = CS.getAttributes();
+ AttributeList AS = CS.getAttributes();
LLVMContext &Ctx = CS.getInstruction()->getContext();
AS = AS.addAttribute(Ctx, Indices, Attribute::get(Ctx, Attribute::NonNull));
CS.setAttributes(AS);
@@ -570,10 +575,6 @@ CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) {
LazyValueInfo *LVI = &AM.getResult<LazyValueAnalysis>(F);
bool Changed = runImpl(F, LVI);
- // FIXME: We need to invalidate LVI to avoid PR28400. Is there a better
- // solution?
- AM.invalidate<LazyValueAnalysis>(F);
-
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
index cc2a3cfaf9d1..07a0ba9b1222 100644
--- a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
@@ -124,9 +124,12 @@ static bool eliminateDeadCode(Function &F, TargetLibraryInfo *TLI) {
}
PreservedAnalyses DCEPass::run(Function &F, FunctionAnalysisManager &AM) {
- if (eliminateDeadCode(F, AM.getCachedResult<TargetLibraryAnalysis>(F)))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
+ if (!eliminateDeadCode(F, AM.getCachedResult<TargetLibraryAnalysis>(F)))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
namespace {
diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 4d4c3baef3f5..1ec38e56aa4c 100644
--- a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -135,13 +135,13 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
if (auto CS = CallSite(I)) {
if (Function *F = CS.getCalledFunction()) {
StringRef FnName = F->getName();
- if (TLI.has(LibFunc::strcpy) && FnName == TLI.getName(LibFunc::strcpy))
+ if (TLI.has(LibFunc_strcpy) && FnName == TLI.getName(LibFunc_strcpy))
return true;
- if (TLI.has(LibFunc::strncpy) && FnName == TLI.getName(LibFunc::strncpy))
+ if (TLI.has(LibFunc_strncpy) && FnName == TLI.getName(LibFunc_strncpy))
return true;
- if (TLI.has(LibFunc::strcat) && FnName == TLI.getName(LibFunc::strcat))
+ if (TLI.has(LibFunc_strcat) && FnName == TLI.getName(LibFunc_strcat))
return true;
- if (TLI.has(LibFunc::strncat) && FnName == TLI.getName(LibFunc::strncat))
+ if (TLI.has(LibFunc_strncat) && FnName == TLI.getName(LibFunc_strncat))
return true;
}
}
@@ -287,19 +287,14 @@ static uint64_t getPointerSize(const Value *V, const DataLayout &DL,
}
namespace {
-enum OverwriteResult {
- OverwriteBegin,
- OverwriteComplete,
- OverwriteEnd,
- OverwriteUnknown
-};
+enum OverwriteResult { OW_Begin, OW_Complete, OW_End, OW_Unknown };
}
-/// Return 'OverwriteComplete' if a store to the 'Later' location completely
-/// overwrites a store to the 'Earlier' location, 'OverwriteEnd' if the end of
-/// the 'Earlier' location is completely overwritten by 'Later',
-/// 'OverwriteBegin' if the beginning of the 'Earlier' location is overwritten
-/// by 'Later', or 'OverwriteUnknown' if nothing can be determined.
+/// Return 'OW_Complete' if a store to the 'Later' location completely
+/// overwrites a store to the 'Earlier' location, 'OW_End' if the end of the
+/// 'Earlier' location is completely overwritten by 'Later', 'OW_Begin' if the
+/// beginning of the 'Earlier' location is overwritten by 'Later', or
+/// 'OW_Unknown' if nothing can be determined.
static OverwriteResult isOverwrite(const MemoryLocation &Later,
const MemoryLocation &Earlier,
const DataLayout &DL,
@@ -310,7 +305,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
// If we don't know the sizes of either access, then we can't do a comparison.
if (Later.Size == MemoryLocation::UnknownSize ||
Earlier.Size == MemoryLocation::UnknownSize)
- return OverwriteUnknown;
+ return OW_Unknown;
const Value *P1 = Earlier.Ptr->stripPointerCasts();
const Value *P2 = Later.Ptr->stripPointerCasts();
@@ -320,7 +315,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
if (P1 == P2) {
// Make sure that the Later size is >= the Earlier size.
if (Later.Size >= Earlier.Size)
- return OverwriteComplete;
+ return OW_Complete;
}
// Check to see if the later store is to the entire object (either a global,
@@ -332,13 +327,13 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
// If we can't resolve the same pointers to the same object, then we can't
// analyze them at all.
if (UO1 != UO2)
- return OverwriteUnknown;
+ return OW_Unknown;
// If the "Later" store is to a recognizable object, get its size.
uint64_t ObjectSize = getPointerSize(UO2, DL, TLI);
if (ObjectSize != MemoryLocation::UnknownSize)
if (ObjectSize == Later.Size && ObjectSize >= Earlier.Size)
- return OverwriteComplete;
+ return OW_Complete;
// Okay, we have stores to two completely different pointers. Try to
// decompose the pointer into a "base + constant_offset" form. If the base
@@ -350,7 +345,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
// If the base pointers still differ, we have two completely different stores.
if (BP1 != BP2)
- return OverwriteUnknown;
+ return OW_Unknown;
// The later store completely overlaps the earlier store if:
//
@@ -370,7 +365,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
if (EarlierOff >= LaterOff &&
Later.Size >= Earlier.Size &&
uint64_t(EarlierOff - LaterOff) + Earlier.Size <= Later.Size)
- return OverwriteComplete;
+ return OW_Complete;
// We may now overlap, although the overlap is not complete. There might also
// be other incomplete overlaps, and together, they might cover the complete
@@ -428,7 +423,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
") Composite Later [" <<
ILI->second << ", " << ILI->first << ")\n");
++NumCompletePartials;
- return OverwriteComplete;
+ return OW_Complete;
}
}
@@ -443,7 +438,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
if (!EnablePartialOverwriteTracking &&
(LaterOff > EarlierOff && LaterOff < int64_t(EarlierOff + Earlier.Size) &&
int64_t(LaterOff + Later.Size) >= int64_t(EarlierOff + Earlier.Size)))
- return OverwriteEnd;
+ return OW_End;
// Finally, we also need to check if the later store overwrites the beginning
// of the earlier store.
@@ -458,11 +453,11 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
(LaterOff <= EarlierOff && int64_t(LaterOff + Later.Size) > EarlierOff)) {
assert(int64_t(LaterOff + Later.Size) <
int64_t(EarlierOff + Earlier.Size) &&
- "Expect to be handled as OverwriteComplete");
- return OverwriteBegin;
+ "Expect to be handled as OW_Complete");
+ return OW_Begin;
}
// Otherwise, they don't completely overlap.
- return OverwriteUnknown;
+ return OW_Unknown;
}
/// If 'Inst' might be a self read (i.e. a noop copy of a
@@ -551,7 +546,7 @@ static bool memoryIsNotModifiedBetween(Instruction *FirstI,
Instruction *I = &*BI;
if (I->mayWriteToMemory() && I != SecondI) {
auto Res = AA->getModRefInfo(I, MemLoc);
- if (Res != MRI_NoModRef)
+ if (Res & MRI_Mod)
return false;
}
}
@@ -909,7 +904,7 @@ static bool tryToShortenBegin(Instruction *EarlierWrite,
if (LaterStart <= EarlierStart && LaterStart + LaterSize > EarlierStart) {
assert(LaterStart + LaterSize < EarlierStart + EarlierSize &&
- "Should have been handled as OverwriteComplete");
+ "Should have been handled as OW_Complete");
if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
LaterSize, false)) {
IntervalMap.erase(OII);
@@ -1105,7 +1100,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
OverwriteResult OR =
isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset,
DepWrite, IOL);
- if (OR == OverwriteComplete) {
+ if (OR == OW_Complete) {
DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: "
<< *DepWrite << "\n KILLER: " << *Inst << '\n');
@@ -1117,15 +1112,15 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
// We erased DepWrite; start over.
InstDep = MD->getDependency(Inst);
continue;
- } else if ((OR == OverwriteEnd && isShortenableAtTheEnd(DepWrite)) ||
- ((OR == OverwriteBegin &&
+ } else if ((OR == OW_End && isShortenableAtTheEnd(DepWrite)) ||
+ ((OR == OW_Begin &&
isShortenableAtTheBeginning(DepWrite)))) {
assert(!EnablePartialOverwriteTracking && "Do not expect to perform "
"when partial-overwrite "
"tracking is enabled");
int64_t EarlierSize = DepLoc.Size;
int64_t LaterSize = Loc.Size;
- bool IsOverwriteEnd = (OR == OverwriteEnd);
+ bool IsOverwriteEnd = (OR == OW_End);
MadeChange |= tryToShorten(DepWrite, DepWriteOffset, EarlierSize,
InstWriteOffset, LaterSize, IsOverwriteEnd);
}
@@ -1186,8 +1181,9 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
if (!eliminateDeadStores(F, AA, MD, DT, TLI))
return PreservedAnalyses::all();
+
PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
PA.preserve<MemoryDependenceAnalysis>();
return PA;
diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index 16e08ee58fbe..04479b6e49ac 100644
--- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -19,6 +19,8 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DataLayout.h"
@@ -32,7 +34,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/MemorySSA.h"
#include <deque>
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -253,6 +254,7 @@ public:
DominatorTree &DT;
AssumptionCache &AC;
MemorySSA *MSSA;
+ std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
typedef RecyclingAllocator<
BumpPtrAllocator, ScopedHashTableVal<SimpleValue, Value *>> AllocatorTy;
typedef ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
@@ -315,7 +317,9 @@ public:
/// \brief Set up the EarlyCSE runner for a particular function.
EarlyCSE(const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI,
DominatorTree &DT, AssumptionCache &AC, MemorySSA *MSSA)
- : TLI(TLI), TTI(TTI), DT(DT), AC(AC), MSSA(MSSA), CurrentGeneration(0) {}
+ : TLI(TLI), TTI(TTI), DT(DT), AC(AC), MSSA(MSSA),
+ MSSAUpdater(make_unique<MemorySSAUpdater>(MSSA)), CurrentGeneration(0) {
+ }
bool run();
@@ -388,7 +392,7 @@ private:
ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI)
: IsTargetMemInst(false), Inst(Inst) {
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
- if (TTI.getTgtMemIntrinsic(II, Info) && Info.NumMemRefs == 1)
+ if (TTI.getTgtMemIntrinsic(II, Info))
IsTargetMemInst = true;
}
bool isLoad() const {
@@ -400,17 +404,14 @@ private:
return isa<StoreInst>(Inst);
}
bool isAtomic() const {
- if (IsTargetMemInst) {
- assert(Info.IsSimple && "need to refine IsSimple in TTI");
- return false;
- }
+ if (IsTargetMemInst)
+ return Info.Ordering != AtomicOrdering::NotAtomic;
return Inst->isAtomic();
}
bool isUnordered() const {
- if (IsTargetMemInst) {
- assert(Info.IsSimple && "need to refine IsSimple in TTI");
- return true;
- }
+ if (IsTargetMemInst)
+ return Info.isUnordered();
+
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
return LI->isUnordered();
} else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
@@ -421,10 +422,9 @@ private:
}
bool isVolatile() const {
- if (IsTargetMemInst) {
- assert(Info.IsSimple && "need to refine IsSimple in TTI");
- return false;
- }
+ if (IsTargetMemInst)
+ return Info.IsVolatile;
+
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
return LI->isVolatile();
} else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
@@ -517,7 +517,7 @@ private:
if (MemoryPhi *MP = dyn_cast<MemoryPhi>(U))
PhisToCheck.push_back(MP);
- MSSA->removeMemoryAccess(WI);
+ MSSAUpdater->removeMemoryAccess(WI);
for (MemoryPhi *MP : PhisToCheck) {
MemoryAccess *FirstIn = MP->getIncomingValue(0);
@@ -587,27 +587,28 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// which reaches this block where the condition might hold a different
// value. Since we're adding this to the scoped hash table (like any other
// def), it will have been popped if we encounter a future merge block.
- if (BasicBlock *Pred = BB->getSinglePredecessor())
- if (auto *BI = dyn_cast<BranchInst>(Pred->getTerminator()))
- if (BI->isConditional())
- if (auto *CondInst = dyn_cast<Instruction>(BI->getCondition()))
- if (SimpleValue::canHandle(CondInst)) {
- assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
- auto *ConditionalConstant = (BI->getSuccessor(0) == BB) ?
- ConstantInt::getTrue(BB->getContext()) :
- ConstantInt::getFalse(BB->getContext());
- AvailableValues.insert(CondInst, ConditionalConstant);
- DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
- << CondInst->getName() << "' as " << *ConditionalConstant
- << " in " << BB->getName() << "\n");
- // Replace all dominated uses with the known value.
- if (unsigned Count =
- replaceDominatedUsesWith(CondInst, ConditionalConstant, DT,
- BasicBlockEdge(Pred, BB))) {
- Changed = true;
- NumCSECVP = NumCSECVP + Count;
- }
- }
+ if (BasicBlock *Pred = BB->getSinglePredecessor()) {
+ auto *BI = dyn_cast<BranchInst>(Pred->getTerminator());
+ if (BI && BI->isConditional()) {
+ auto *CondInst = dyn_cast<Instruction>(BI->getCondition());
+ if (CondInst && SimpleValue::canHandle(CondInst)) {
+ assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
+ auto *TorF = (BI->getSuccessor(0) == BB)
+ ? ConstantInt::getTrue(BB->getContext())
+ : ConstantInt::getFalse(BB->getContext());
+ AvailableValues.insert(CondInst, TorF);
+ DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
+ << CondInst->getName() << "' as " << *TorF << " in "
+ << BB->getName() << "\n");
+ // Replace all dominated uses with the known value.
+ if (unsigned Count = replaceDominatedUsesWith(
+ CondInst, TorF, DT, BasicBlockEdge(Pred, BB))) {
+ Changed = true;
+ NumCSECVP = NumCSECVP + Count;
+ }
+ }
+ }
+ }
/// LastStore - Keep track of the last non-volatile store that we saw... for
/// as long as there in no instruction that reads memory. If we see a store
@@ -761,12 +762,13 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
continue;
}
- // If this instruction may read from memory, forget LastStore.
- // Load/store intrinsics will indicate both a read and a write to
- // memory. The target may override this (e.g. so that a store intrinsic
- // does not read from memory, and thus will be treated the same as a
- // regular store for commoning purposes).
- if (Inst->mayReadFromMemory() &&
+ // If this instruction may read from memory or throw (and potentially read
+ // from memory in the exception handler), forget LastStore. Load/store
+ // intrinsics will indicate both a read and a write to memory. The target
+ // may override this (e.g. so that a store intrinsic does not read from
+ // memory, and thus will be treated the same as a regular store for
+ // commoning purposes).
+ if ((Inst->mayReadFromMemory() || Inst->mayThrow()) &&
!(MemInst.isValid() && !MemInst.mayReadFromMemory()))
LastStore = nullptr;
@@ -967,10 +969,8 @@ PreservedAnalyses EarlyCSEPass::run(Function &F,
if (!CSE.run())
return PreservedAnalyses::all();
- // CSE preserves the dominator tree because it doesn't mutate the CFG.
- // FIXME: Bundle this with other CFG-preservation.
PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
if (UseMemorySSA)
PA.preserve<MemorySSAAnalysis>();
diff --git a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
index 545036d724ef..8a5af6195f1b 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
@@ -516,11 +516,10 @@ FunctionPass *createFloat2IntPass() { return new Float2IntLegacyPass(); }
PreservedAnalyses Float2IntPass::run(Function &F, FunctionAnalysisManager &) {
if (!runImpl(F))
return PreservedAnalyses::all();
- else {
- // FIXME: This should also 'preserve the CFG'.
- PreservedAnalyses PA;
- PA.preserve<GlobalsAA>();
- return PA;
- }
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
}
} // End namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
index 0137378b828b..be696df548d5 100644
--- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -36,7 +36,6 @@
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/PHITransAddr.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalVariable.h"
@@ -51,9 +50,12 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/VNCoercion.h"
+
#include <vector>
using namespace llvm;
using namespace llvm::gvn;
+using namespace llvm::VNCoercion;
using namespace PatternMatch;
#define DEBUG_TYPE "gvn"
@@ -595,11 +597,12 @@ PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) {
PreservedAnalyses PA;
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<GlobalsAA>();
+ PA.preserve<TargetLibraryAnalysis>();
return PA;
}
-LLVM_DUMP_METHOD
-void GVN::dump(DenseMap<uint32_t, Value*>& d) {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void GVN::dump(DenseMap<uint32_t, Value*>& d) {
errs() << "{\n";
for (DenseMap<uint32_t, Value*>::iterator I = d.begin(),
E = d.end(); I != E; ++I) {
@@ -608,6 +611,7 @@ void GVN::dump(DenseMap<uint32_t, Value*>& d) {
}
errs() << "}\n";
}
+#endif
/// Return true if we can prove that the value
/// we're analyzing is fully available in the specified block. As we go, keep
@@ -690,442 +694,6 @@ SpeculationFailure:
}
-/// Return true if CoerceAvailableValueToLoadType will succeed.
-static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
- Type *LoadTy,
- const DataLayout &DL) {
- // If the loaded or stored value is an first class array or struct, don't try
- // to transform them. We need to be able to bitcast to integer.
- if (LoadTy->isStructTy() || LoadTy->isArrayTy() ||
- StoredVal->getType()->isStructTy() ||
- StoredVal->getType()->isArrayTy())
- return false;
-
- // The store has to be at least as big as the load.
- if (DL.getTypeSizeInBits(StoredVal->getType()) <
- DL.getTypeSizeInBits(LoadTy))
- return false;
-
- return true;
-}
-
-/// If we saw a store of a value to memory, and
-/// then a load from a must-aliased pointer of a different type, try to coerce
-/// the stored value. LoadedTy is the type of the load we want to replace.
-/// IRB is IRBuilder used to insert new instructions.
-///
-/// If we can't do it, return null.
-static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
- IRBuilder<> &IRB,
- const DataLayout &DL) {
- assert(CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) &&
- "precondition violation - materialization can't fail");
-
- if (auto *C = dyn_cast<Constant>(StoredVal))
- if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
- StoredVal = FoldedStoredVal;
-
- // If this is already the right type, just return it.
- Type *StoredValTy = StoredVal->getType();
-
- uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy);
- uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy);
-
- // If the store and reload are the same size, we can always reuse it.
- if (StoredValSize == LoadedValSize) {
- // Pointer to Pointer -> use bitcast.
- if (StoredValTy->getScalarType()->isPointerTy() &&
- LoadedTy->getScalarType()->isPointerTy()) {
- StoredVal = IRB.CreateBitCast(StoredVal, LoadedTy);
- } else {
- // Convert source pointers to integers, which can be bitcast.
- if (StoredValTy->getScalarType()->isPointerTy()) {
- StoredValTy = DL.getIntPtrType(StoredValTy);
- StoredVal = IRB.CreatePtrToInt(StoredVal, StoredValTy);
- }
-
- Type *TypeToCastTo = LoadedTy;
- if (TypeToCastTo->getScalarType()->isPointerTy())
- TypeToCastTo = DL.getIntPtrType(TypeToCastTo);
-
- if (StoredValTy != TypeToCastTo)
- StoredVal = IRB.CreateBitCast(StoredVal, TypeToCastTo);
-
- // Cast to pointer if the load needs a pointer type.
- if (LoadedTy->getScalarType()->isPointerTy())
- StoredVal = IRB.CreateIntToPtr(StoredVal, LoadedTy);
- }
-
- if (auto *C = dyn_cast<ConstantExpr>(StoredVal))
- if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
- StoredVal = FoldedStoredVal;
-
- return StoredVal;
- }
-
- // If the loaded value is smaller than the available value, then we can
- // extract out a piece from it. If the available value is too small, then we
- // can't do anything.
- assert(StoredValSize >= LoadedValSize &&
- "CanCoerceMustAliasedValueToLoad fail");
-
- // Convert source pointers to integers, which can be manipulated.
- if (StoredValTy->getScalarType()->isPointerTy()) {
- StoredValTy = DL.getIntPtrType(StoredValTy);
- StoredVal = IRB.CreatePtrToInt(StoredVal, StoredValTy);
- }
-
- // Convert vectors and fp to integer, which can be manipulated.
- if (!StoredValTy->isIntegerTy()) {
- StoredValTy = IntegerType::get(StoredValTy->getContext(), StoredValSize);
- StoredVal = IRB.CreateBitCast(StoredVal, StoredValTy);
- }
-
- // If this is a big-endian system, we need to shift the value down to the low
- // bits so that a truncate will work.
- if (DL.isBigEndian()) {
- uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy) -
- DL.getTypeStoreSizeInBits(LoadedTy);
- StoredVal = IRB.CreateLShr(StoredVal, ShiftAmt, "tmp");
- }
-
- // Truncate the integer to the right size now.
- Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadedValSize);
- StoredVal = IRB.CreateTrunc(StoredVal, NewIntTy, "trunc");
-
- if (LoadedTy != NewIntTy) {
- // If the result is a pointer, inttoptr.
- if (LoadedTy->getScalarType()->isPointerTy())
- StoredVal = IRB.CreateIntToPtr(StoredVal, LoadedTy, "inttoptr");
- else
- // Otherwise, bitcast.
- StoredVal = IRB.CreateBitCast(StoredVal, LoadedTy, "bitcast");
- }
-
- if (auto *C = dyn_cast<Constant>(StoredVal))
- if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
- StoredVal = FoldedStoredVal;
-
- return StoredVal;
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering memory write (store,
-/// memset, memcpy, memmove). This means that the write *may* provide bits used
-/// by the load but we can't be sure because the pointers don't mustalias.
-///
-/// Check this case to see if there is anything more we can do before we give
-/// up. This returns -1 if we have to give up, or a byte number in the stored
-/// value of the piece that feeds the load.
-static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
- Value *WritePtr,
- uint64_t WriteSizeInBits,
- const DataLayout &DL) {
- // If the loaded or stored value is a first class array or struct, don't try
- // to transform them. We need to be able to bitcast to integer.
- if (LoadTy->isStructTy() || LoadTy->isArrayTy())
- return -1;
-
- int64_t StoreOffset = 0, LoadOffset = 0;
- Value *StoreBase =
- GetPointerBaseWithConstantOffset(WritePtr, StoreOffset, DL);
- Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, DL);
- if (StoreBase != LoadBase)
- return -1;
-
- // If the load and store are to the exact same address, they should have been
- // a must alias. AA must have gotten confused.
- // FIXME: Study to see if/when this happens. One case is forwarding a memset
- // to a load from the base of the memset.
-
- // If the load and store don't overlap at all, the store doesn't provide
- // anything to the load. In this case, they really don't alias at all, AA
- // must have gotten confused.
- uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy);
-
- if ((WriteSizeInBits & 7) | (LoadSize & 7))
- return -1;
- uint64_t StoreSize = WriteSizeInBits / 8; // Convert to bytes.
- LoadSize /= 8;
-
-
- bool isAAFailure = false;
- if (StoreOffset < LoadOffset)
- isAAFailure = StoreOffset+int64_t(StoreSize) <= LoadOffset;
- else
- isAAFailure = LoadOffset+int64_t(LoadSize) <= StoreOffset;
-
- if (isAAFailure)
- return -1;
-
- // If the Load isn't completely contained within the stored bits, we don't
- // have all the bits to feed it. We could do something crazy in the future
- // (issue a smaller load then merge the bits in) but this seems unlikely to be
- // valuable.
- if (StoreOffset > LoadOffset ||
- StoreOffset+StoreSize < LoadOffset+LoadSize)
- return -1;
-
- // Okay, we can do this transformation. Return the number of bytes into the
- // store that the load is.
- return LoadOffset-StoreOffset;
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering store.
-static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
- StoreInst *DepSI) {
- // Cannot handle reading from store of first-class aggregate yet.
- if (DepSI->getValueOperand()->getType()->isStructTy() ||
- DepSI->getValueOperand()->getType()->isArrayTy())
- return -1;
-
- const DataLayout &DL = DepSI->getModule()->getDataLayout();
- Value *StorePtr = DepSI->getPointerOperand();
- uint64_t StoreSize =DL.getTypeSizeInBits(DepSI->getValueOperand()->getType());
- return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
- StorePtr, StoreSize, DL);
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being clobbered by another load. See if
-/// the other load can feed into the second load.
-static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
- LoadInst *DepLI, const DataLayout &DL){
- // Cannot handle reading from store of first-class aggregate yet.
- if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
- return -1;
-
- Value *DepPtr = DepLI->getPointerOperand();
- uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType());
- int R = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL);
- if (R != -1) return R;
-
- // If we have a load/load clobber an DepLI can be widened to cover this load,
- // then we should widen it!
- int64_t LoadOffs = 0;
- const Value *LoadBase =
- GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
- unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
-
- unsigned Size = MemoryDependenceResults::getLoadLoadClobberFullWidthSize(
- LoadBase, LoadOffs, LoadSize, DepLI);
- if (Size == 0) return -1;
-
- // Check non-obvious conditions enforced by MDA which we rely on for being
- // able to materialize this potentially available value
- assert(DepLI->isSimple() && "Cannot widen volatile/atomic load!");
- assert(DepLI->getType()->isIntegerTy() && "Can't widen non-integer load");
-
- return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, DL);
-}
-
-
-
-static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
- MemIntrinsic *MI,
- const DataLayout &DL) {
- // If the mem operation is a non-constant size, we can't handle it.
- ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength());
- if (!SizeCst) return -1;
- uint64_t MemSizeInBits = SizeCst->getZExtValue()*8;
-
- // If this is memset, we just need to see if the offset is valid in the size
- // of the memset..
- if (MI->getIntrinsicID() == Intrinsic::memset)
- return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
- MemSizeInBits, DL);
-
- // If we have a memcpy/memmove, the only case we can handle is if this is a
- // copy from constant memory. In that case, we can read directly from the
- // constant memory.
- MemTransferInst *MTI = cast<MemTransferInst>(MI);
-
- Constant *Src = dyn_cast<Constant>(MTI->getSource());
- if (!Src) return -1;
-
- GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, DL));
- if (!GV || !GV->isConstant()) return -1;
-
- // See if the access is within the bounds of the transfer.
- int Offset = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
- MI->getDest(), MemSizeInBits, DL);
- if (Offset == -1)
- return Offset;
-
- unsigned AS = Src->getType()->getPointerAddressSpace();
- // Otherwise, see if we can constant fold a load from the constant with the
- // offset applied as appropriate.
- Src = ConstantExpr::getBitCast(Src,
- Type::getInt8PtrTy(Src->getContext(), AS));
- Constant *OffsetCst =
- ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
- Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
- OffsetCst);
- Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
- if (ConstantFoldLoadFromConstPtr(Src, LoadTy, DL))
- return Offset;
- return -1;
-}
-
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering store. This means
-/// that the store provides bits used by the load but we the pointers don't
-/// mustalias. Check this case to see if there is anything more we can do
-/// before we give up.
-static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
- Type *LoadTy,
- Instruction *InsertPt, const DataLayout &DL){
- LLVMContext &Ctx = SrcVal->getType()->getContext();
-
- uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
- uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8;
-
- IRBuilder<> Builder(InsertPt);
-
- // Compute which bits of the stored value are being used by the load. Convert
- // to an integer type to start with.
- if (SrcVal->getType()->getScalarType()->isPointerTy())
- SrcVal = Builder.CreatePtrToInt(SrcVal,
- DL.getIntPtrType(SrcVal->getType()));
- if (!SrcVal->getType()->isIntegerTy())
- SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8));
-
- // Shift the bits to the least significant depending on endianness.
- unsigned ShiftAmt;
- if (DL.isLittleEndian())
- ShiftAmt = Offset*8;
- else
- ShiftAmt = (StoreSize-LoadSize-Offset)*8;
-
- if (ShiftAmt)
- SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt);
-
- if (LoadSize != StoreSize)
- SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8));
-
- return CoerceAvailableValueToLoadType(SrcVal, LoadTy, Builder, DL);
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering load. This means
-/// that the load *may* provide bits used by the load but we can't be sure
-/// because the pointers don't mustalias. Check this case to see if there is
-/// anything more we can do before we give up.
-static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
- Type *LoadTy, Instruction *InsertPt,
- GVN &gvn) {
- const DataLayout &DL = SrcVal->getModule()->getDataLayout();
- // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
- // widen SrcVal out to a larger load.
- unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType());
- unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
- if (Offset+LoadSize > SrcValStoreSize) {
- assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
- assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
- // If we have a load/load clobber an DepLI can be widened to cover this
- // load, then we should widen it to the next power of 2 size big enough!
- unsigned NewLoadSize = Offset+LoadSize;
- if (!isPowerOf2_32(NewLoadSize))
- NewLoadSize = NextPowerOf2(NewLoadSize);
-
- Value *PtrVal = SrcVal->getPointerOperand();
-
- // Insert the new load after the old load. This ensures that subsequent
- // memdep queries will find the new load. We can't easily remove the old
- // load completely because it is already in the value numbering table.
- IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal));
- Type *DestPTy =
- IntegerType::get(LoadTy->getContext(), NewLoadSize*8);
- DestPTy = PointerType::get(DestPTy,
- PtrVal->getType()->getPointerAddressSpace());
- Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
- PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
- LoadInst *NewLoad = Builder.CreateLoad(PtrVal);
- NewLoad->takeName(SrcVal);
- NewLoad->setAlignment(SrcVal->getAlignment());
-
- DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
- DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
-
- // Replace uses of the original load with the wider load. On a big endian
- // system, we need to shift down to get the relevant bits.
- Value *RV = NewLoad;
- if (DL.isBigEndian())
- RV = Builder.CreateLShr(RV, (NewLoadSize - SrcValStoreSize) * 8);
- RV = Builder.CreateTrunc(RV, SrcVal->getType());
- SrcVal->replaceAllUsesWith(RV);
-
- // We would like to use gvn.markInstructionForDeletion here, but we can't
- // because the load is already memoized into the leader map table that GVN
- // tracks. It is potentially possible to remove the load from the table,
- // but then there all of the operations based on it would need to be
- // rehashed. Just leave the dead load around.
- gvn.getMemDep().removeInstruction(SrcVal);
- SrcVal = NewLoad;
- }
-
- return GetStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, DL);
-}
-
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering mem intrinsic.
-static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
- Type *LoadTy, Instruction *InsertPt,
- const DataLayout &DL){
- LLVMContext &Ctx = LoadTy->getContext();
- uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy)/8;
-
- IRBuilder<> Builder(InsertPt);
-
- // We know that this method is only called when the mem transfer fully
- // provides the bits for the load.
- if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) {
- // memset(P, 'x', 1234) -> splat('x'), even if x is a variable, and
- // independently of what the offset is.
- Value *Val = MSI->getValue();
- if (LoadSize != 1)
- Val = Builder.CreateZExt(Val, IntegerType::get(Ctx, LoadSize*8));
-
- Value *OneElt = Val;
-
- // Splat the value out to the right number of bits.
- for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize; ) {
- // If we can double the number of bytes set, do it.
- if (NumBytesSet*2 <= LoadSize) {
- Value *ShVal = Builder.CreateShl(Val, NumBytesSet*8);
- Val = Builder.CreateOr(Val, ShVal);
- NumBytesSet <<= 1;
- continue;
- }
-
- // Otherwise insert one byte at a time.
- Value *ShVal = Builder.CreateShl(Val, 1*8);
- Val = Builder.CreateOr(OneElt, ShVal);
- ++NumBytesSet;
- }
-
- return CoerceAvailableValueToLoadType(Val, LoadTy, Builder, DL);
- }
-
- // Otherwise, this is a memcpy/memmove from a constant global.
- MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
- Constant *Src = cast<Constant>(MTI->getSource());
- unsigned AS = Src->getType()->getPointerAddressSpace();
-
- // Otherwise, see if we can constant fold a load from the constant with the
- // offset applied as appropriate.
- Src = ConstantExpr::getBitCast(Src,
- Type::getInt8PtrTy(Src->getContext(), AS));
- Constant *OffsetCst =
- ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
- Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
- OffsetCst);
- Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
- return ConstantFoldLoadFromConstPtr(Src, LoadTy, DL);
-}
/// Given a set of loads specified by ValuesPerBlock,
@@ -1171,7 +739,7 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI,
if (isSimpleValue()) {
Res = getSimpleValue();
if (Res->getType() != LoadTy) {
- Res = GetStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
+ Res = getStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " "
<< *getSimpleValue() << '\n'
@@ -1182,14 +750,20 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI,
if (Load->getType() == LoadTy && Offset == 0) {
Res = Load;
} else {
- Res = GetLoadValueForLoad(Load, Offset, LoadTy, InsertPt, gvn);
-
+ Res = getLoadValueForLoad(Load, Offset, LoadTy, InsertPt, DL);
+ // We would like to use gvn.markInstructionForDeletion here, but we can't
+ // because the load is already memoized into the leader map table that GVN
+ // tracks. It is potentially possible to remove the load from the table,
+ // but then there all of the operations based on it would need to be
+ // rehashed. Just leave the dead load around.
+ gvn.getMemDep().removeInstruction(Load);
DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " "
<< *getCoercedLoadValue() << '\n'
- << *Res << '\n' << "\n\n\n");
+ << *Res << '\n'
+ << "\n\n\n");
}
} else if (isMemIntrinValue()) {
- Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy,
+ Res = getMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy,
InsertPt, DL);
DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
<< " " << *getMemIntrinValue() << '\n'
@@ -1258,7 +832,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
// Can't forward from non-atomic to atomic without violating memory model.
if (Address && LI->isAtomic() <= DepSI->isAtomic()) {
int Offset =
- AnalyzeLoadFromClobberingStore(LI->getType(), Address, DepSI);
+ analyzeLoadFromClobberingStore(LI->getType(), Address, DepSI, DL);
if (Offset != -1) {
Res = AvailableValue::get(DepSI->getValueOperand(), Offset);
return true;
@@ -1276,7 +850,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
// Can't forward from non-atomic to atomic without violating memory model.
if (DepLI != LI && Address && LI->isAtomic() <= DepLI->isAtomic()) {
int Offset =
- AnalyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL);
+ analyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL);
if (Offset != -1) {
Res = AvailableValue::getLoad(DepLI, Offset);
@@ -1289,7 +863,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
// forward a value on from it.
if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) {
if (Address && !LI->isAtomic()) {
- int Offset = AnalyzeLoadFromClobberingMemInst(LI->getType(), Address,
+ int Offset = analyzeLoadFromClobberingMemInst(LI->getType(), Address,
DepMI, DL);
if (Offset != -1) {
Res = AvailableValue::getMI(DepMI, Offset);
@@ -1334,7 +908,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
// different types if we have to. If the stored value is larger or equal to
// the loaded value, we can reuse it.
if (S->getValueOperand()->getType() != LI->getType() &&
- !CanCoerceMustAliasedValueToLoad(S->getValueOperand(),
+ !canCoerceMustAliasedValueToLoad(S->getValueOperand(),
LI->getType(), DL))
return false;
@@ -1351,7 +925,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
// If the stored value is larger or equal to the loaded value, we can reuse
// it.
if (LD->getType() != LI->getType() &&
- !CanCoerceMustAliasedValueToLoad(LD, LI->getType(), DL))
+ !canCoerceMustAliasedValueToLoad(LD, LI->getType(), DL))
return false;
// Can't forward from non-atomic to atomic without violating memory model.
@@ -1713,7 +1287,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
// If instruction I has debug info, then we should not update it.
// Also, if I has a null DebugLoc, then it is still potentially incorrect
// to propagate LI's DebugLoc because LI may not post-dominate I.
- if (LI->getDebugLoc() && ValuesPerBlock.size() != 1)
+ if (LI->getDebugLoc() && LI->getParent() == I->getParent())
I->setDebugLoc(LI->getDebugLoc());
if (V->getType()->getScalarType()->isPointerTy())
MD->invalidateCachedPointerInfo(V);
@@ -1795,7 +1369,7 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) {
// Patch the replacement so that it is not more restrictive than the value
// being replaced.
- // Note that if 'I' is a load being replaced by some operation,
+ // Note that if 'I' is a load being replaced by some operation,
// for example, by an arithmetic operation, then andIRFlags()
// would just erase all math flags from the original arithmetic
// operation, which is clearly not wanted and not needed.
@@ -2187,11 +1761,11 @@ bool GVN::processInstruction(Instruction *I) {
for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
i != e; ++i) {
- BasicBlock *Dst = i.getCaseSuccessor();
+ BasicBlock *Dst = i->getCaseSuccessor();
// If there is only a single edge, propagate the case value into it.
if (SwitchEdges.lookup(Dst) == 1) {
BasicBlockEdge E(Parent, Dst);
- Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E, true);
+ Changed |= propagateEquality(SwitchCond, i->getCaseValue(), E, true);
}
}
return Changed;
@@ -2581,21 +2155,12 @@ bool GVN::iterateOnFunction(Function &F) {
// Top-down walk of the dominator tree
bool Changed = false;
- // Save the blocks this function have before transformation begins. GVN may
- // split critical edge, and hence may invalidate the RPO/DT iterator.
- //
- std::vector<BasicBlock *> BBVect;
- BBVect.reserve(256);
// Needed for value numbering with phi construction to work.
+ // RPOT walks the graph in its constructor and will not be invalidated during
+ // processBlock.
ReversePostOrderTraversal<Function *> RPOT(&F);
- for (ReversePostOrderTraversal<Function *>::rpo_iterator RI = RPOT.begin(),
- RE = RPOT.end();
- RI != RE; ++RI)
- BBVect.push_back(*RI);
-
- for (std::vector<BasicBlock *>::iterator I = BBVect.begin(), E = BBVect.end();
- I != E; I++)
- Changed |= processBlock(*I);
+ for (BasicBlock *BB : RPOT)
+ Changed |= processBlock(BB);
return Changed;
}
@@ -2783,6 +2348,7 @@ public:
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<TargetLibraryInfoWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp
index f8e1d2e1a08a..6adfe130d148 100644
--- a/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp
@@ -17,16 +17,39 @@
// is disabled in the following cases.
// 1. Scalars across calls.
// 2. geps when corresponding load/store cannot be hoisted.
+//
+// TODO: Hoist from >2 successors. Currently GVNHoist will not hoist stores
+// in this case because it works on two instructions at a time.
+// entry:
+// switch i32 %c1, label %exit1 [
+// i32 0, label %sw0
+// i32 1, label %sw1
+// ]
+//
+// sw0:
+// store i32 1, i32* @G
+// br label %exit
+//
+// sw1:
+// store i32 1, i32* @G
+// br label %exit
+//
+// exit1:
+// store i32 1, i32* @G
+// ret void
+// exit:
+// ret void
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/MemorySSA.h"
using namespace llvm;
@@ -60,7 +83,7 @@ static cl::opt<int>
cl::desc("Maximum length of dependent chains to hoist "
"(default = 10, unlimited = -1)"));
-namespace {
+namespace llvm {
// Provides a sorting function based on the execution order of two instructions.
struct SortByDFSIn {
@@ -72,13 +95,6 @@ public:
// Returns true when A executes before B.
bool operator()(const Instruction *A, const Instruction *B) const {
- // FIXME: libc++ has a std::sort() algorithm that will call the compare
- // function on the same element. Once PR20837 is fixed and some more years
- // pass by and all the buildbots have moved to a corrected std::sort(),
- // enable the following assert:
- //
- // assert(A != B);
-
const BasicBlock *BA = A->getParent();
const BasicBlock *BB = B->getParent();
unsigned ADFS, BDFS;
@@ -202,6 +218,7 @@ public:
GVNHoist(DominatorTree *DT, AliasAnalysis *AA, MemoryDependenceResults *MD,
MemorySSA *MSSA)
: DT(DT), AA(AA), MD(MD), MSSA(MSSA),
+ MSSAUpdater(make_unique<MemorySSAUpdater>(MSSA)),
HoistingGeps(false),
HoistedCtr(0)
{ }
@@ -249,9 +266,11 @@ private:
AliasAnalysis *AA;
MemoryDependenceResults *MD;
MemorySSA *MSSA;
+ std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
const bool HoistingGeps;
DenseMap<const Value *, unsigned> DFSNumber;
BBSideEffectsSet BBSideEffects;
+ DenseSet<const BasicBlock*> HoistBarrier;
int HoistedCtr;
enum InsKind { Unknown, Scalar, Load, Store };
@@ -307,8 +326,8 @@ private:
continue;
}
- // Check for end of function, calls that do not return, etc.
- if (!isGuaranteedToTransferExecutionToSuccessor(BB->getTerminator()))
+ // We reached a leaf basic block, so not all paths contain this instruction.
+ if (!BB->getTerminator()->getNumSuccessors())
return false;
// When reaching the back-edge of a loop, there may be a path through the
@@ -360,7 +379,7 @@ private:
ReachedNewPt = true;
}
}
- if (defClobbersUseOrDef(Def, MU, *AA))
+ if (MemorySSAUtil::defClobbersUseOrDef(Def, MU, *AA))
return true;
}
@@ -387,7 +406,8 @@ private:
// executed between the execution of NewBB and OldBB. Hoisting an expression
// from OldBB into NewBB has to be safe on all execution paths.
for (auto I = idf_begin(OldBB), E = idf_end(OldBB); I != E;) {
- if (*I == NewBB) {
+ const BasicBlock *BB = *I;
+ if (BB == NewBB) {
// Stop traversal when reaching HoistPt.
I.skipChildren();
continue;
@@ -398,11 +418,17 @@ private:
return true;
// Impossible to hoist with exceptions on the path.
- if (hasEH(*I))
+ if (hasEH(BB))
+ return true;
+
+ // No instruction that follows a hoist barrier in a basic block is ever
+ // selected for hoisting, so instructions chosen from a block that itself
+ // contains a barrier may still be hoisted; only a barrier in an
+ // intermediate block makes the path unsafe.
+ if ((BB != OldBB) && HoistBarrier.count(BB))
return true;
// Check that we do not move a store past loads.
- if (hasMemoryUse(NewPt, Def, *I))
+ if (hasMemoryUse(NewPt, Def, BB))
return true;
// -1 is unlimited number of blocks on all paths.
@@ -419,17 +445,18 @@ private:
// Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and
// return true when the counter NBBsOnAllPaths reaches 0, except when it is
// initialized to -1 which is unlimited.
- bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *BB,
+ bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *SrcBB,
int &NBBsOnAllPaths) {
- assert(DT->dominates(HoistPt, BB) && "Invalid path");
+ assert(DT->dominates(HoistPt, SrcBB) && "Invalid path");
// Walk all basic blocks reachable in depth-first iteration on
// the inverse CFG from BBInsn to NewHoistPt. These blocks are all the
// blocks that may be executed between the execution of NewHoistPt and
// BBInsn. Hoisting an expression from BBInsn into NewHoistPt has to be safe
// on all execution paths.
- for (auto I = idf_begin(BB), E = idf_end(BB); I != E;) {
- if (*I == HoistPt) {
+ for (auto I = idf_begin(SrcBB), E = idf_end(SrcBB); I != E;) {
+ const BasicBlock *BB = *I;
+ if (BB == HoistPt) {
// Stop traversal when reaching NewHoistPt.
I.skipChildren();
continue;
@@ -440,7 +467,13 @@ private:
return true;
// Impossible to hoist with exceptions on the path.
- if (hasEH(*I))
+ if (hasEH(BB))
+ return true;
+
+ // No instruction that follows a hoist barrier in a basic block is ever
+ // selected for hoisting, so instructions chosen from a block that itself
+ // contains a barrier may still be hoisted; only a barrier in an
+ // intermediate block makes the path unsafe.
+ if ((BB != SrcBB) && HoistBarrier.count(BB))
return true;
// -1 is unlimited number of blocks on all paths.
@@ -626,6 +659,8 @@ private:
// Compute the insertion point and the list of expressions to be hoisted.
SmallVecInsn InstructionsToHoist;
for (auto I : V)
+ // We don't need to check for hoist barriers here: if I->getParent()
+ // contains a barrier then I precedes that barrier.
if (!hasEH(I->getParent()))
InstructionsToHoist.push_back(I);
@@ -809,9 +844,9 @@ private:
// legal when the ld/st is not moved past its current definition.
MemoryAccess *Def = OldMemAcc->getDefiningAccess();
NewMemAcc =
- MSSA->createMemoryAccessInBB(Repl, Def, HoistPt, MemorySSA::End);
+ MSSAUpdater->createMemoryAccessInBB(Repl, Def, HoistPt, MemorySSA::End);
OldMemAcc->replaceAllUsesWith(NewMemAcc);
- MSSA->removeMemoryAccess(OldMemAcc);
+ MSSAUpdater->removeMemoryAccess(OldMemAcc);
}
}
@@ -850,7 +885,7 @@ private:
// Update the uses of the old MSSA access with NewMemAcc.
MemoryAccess *OldMA = MSSA->getMemoryAccess(I);
OldMA->replaceAllUsesWith(NewMemAcc);
- MSSA->removeMemoryAccess(OldMA);
+ MSSAUpdater->removeMemoryAccess(OldMA);
}
Repl->andIRFlags(I);
@@ -872,7 +907,7 @@ private:
auto In = Phi->incoming_values();
if (all_of(In, [&](Use &U) { return U == NewMemAcc; })) {
Phi->replaceAllUsesWith(NewMemAcc);
- MSSA->removeMemoryAccess(Phi);
+ MSSAUpdater->removeMemoryAccess(Phi);
}
}
}
@@ -896,6 +931,12 @@ private:
for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
int InstructionNb = 0;
for (Instruction &I1 : *BB) {
+ // If I1 cannot guarantee progress, subsequent instructions
+ // in BB cannot be hoisted anyway.
+ if (!isGuaranteedToTransferExecutionToSuccessor(&I1)) {
+ HoistBarrier.insert(BB);
+ break;
+ }
// Only hoist the first instructions in BB up to MaxDepthInBB. Hoisting
// deeper may increase the register pressure and compilation time.
if (MaxDepthInBB != -1 && InstructionNb++ >= MaxDepthInBB)
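
The HoistBarrier set introduced above marks blocks containing an instruction that may not transfer execution to its successor; candidates are only collected before that point. A rough standalone sketch of the scan, with a hypothetical toy instruction model in place of isGuaranteedToTransferExecutionToSuccessor:

    #include <cstdio>
    #include <set>
    #include <string>
    #include <vector>

    struct Inst { std::string Name; bool MayNotTransferExecution; };
    struct Block { int Id; std::vector<Inst> Insts; };

    int main() {
      std::vector<Block> Blocks = {
          {0, {{"load", false}, {"add", false}}},
          {1, {{"call @may_throw", true}, {"store", false}}}};

      std::set<int> HoistBarrier;
      for (const Block &BB : Blocks) {
        for (const Inst &I : BB.Insts) {
          if (I.MayNotTransferExecution) {
            // Later instructions in BB are never considered for hoisting.
            HoistBarrier.insert(BB.Id);
            break;
          }
          std::printf("block %d: '%s' is a hoisting candidate\n", BB.Id,
                      I.Name.c_str());
        }
      }
      for (int Id : HoistBarrier)
        std::printf("block %d contains a hoist barrier\n", Id);
      return 0;
    }

Candidates from a barrier block always precede the barrier, which is why the path checks above exempt the starting block from the HoistBarrier test.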
diff --git a/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp
index b05ef002a456..7019287954a1 100644
--- a/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp
@@ -568,8 +568,7 @@ bool GuardWideningImpl::combineRangeChecks(
return RC.getBase() == CurrentBase && RC.getLength() == CurrentLength;
};
- std::copy_if(Checks.begin(), Checks.end(),
- std::back_inserter(CurrentChecks), IsCurrentCheck);
+ copy_if(Checks, std::back_inserter(CurrentChecks), IsCurrentCheck);
Checks.erase(remove_if(Checks, IsCurrentCheck), Checks.end());
assert(CurrentChecks.size() != 0 && "We know we have at least one!");
@@ -658,8 +657,12 @@ PreservedAnalyses GuardWideningPass::run(Function &F,
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &LI = AM.getResult<LoopAnalysis>(F);
auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
- bool Changed = GuardWideningImpl(DT, PDT, LI).run();
- return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+ if (!GuardWideningImpl(DT, PDT, LI).run())
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) {
diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index 1752fb75eb1b..dcb2a4a0c6e6 100644
--- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -231,8 +231,9 @@ static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) {
bool isExact = false;
// See if we can convert this to an int64_t
uint64_t UIntVal;
- if (APF.convertToInteger(&UIntVal, 64, true, APFloat::rmTowardZero,
- &isExact) != APFloat::opOK || !isExact)
+ if (APF.convertToInteger(makeMutableArrayRef(UIntVal), 64, true,
+ APFloat::rmTowardZero, &isExact) != APFloat::opOK ||
+ !isExact)
return false;
IntVal = UIntVal;
return true;
@@ -906,7 +907,7 @@ class WidenIV {
SmallVector<NarrowIVDefUse, 8> NarrowIVUsers;
enum ExtendKind { ZeroExtended, SignExtended, Unknown };
- // A map tracking the kind of extension used to widen each narrow IV
+ // A map tracking the kind of extension used to widen each narrow IV
// and narrow IV user.
// Key: pointer to a narrow IV or IV user.
// Value: the kind of extension used to widen this Instruction.
@@ -1608,7 +1609,7 @@ void WidenIV::calculatePostIncRange(Instruction *NarrowDef,
return;
CmpInst::Predicate P =
- TrueDest ? Pred : CmpInst::getInversePredicate(Pred);
+ TrueDest ? Pred : CmpInst::getInversePredicate(Pred);
auto CmpRHSRange = SE->getSignedRange(SE->getSCEV(CmpRHS));
auto CmpConstrainedLHSRange =
@@ -1634,7 +1635,7 @@ void WidenIV::calculatePostIncRange(Instruction *NarrowDef,
UpdateRangeFromGuards(NarrowUser);
BasicBlock *NarrowUserBB = NarrowUser->getParent();
- // If NarrowUserBB is statically unreachable asking dominator queries may
+ // If NarrowUserBB is statically unreachable asking dominator queries may
// yield surprising results. (e.g. the block may not have a dom tree node)
if (!DT->isReachableFromEntry(NarrowUserBB))
return;
@@ -2152,6 +2153,8 @@ linearFunctionTestReplace(Loop *L,
Value *CmpIndVar = IndVar;
const SCEV *IVCount = BackedgeTakenCount;
+ assert(L->getLoopLatch() && "Loop no longer in simplified form?");
+
// If the exiting block is the same as the backedge block, we prefer to
// compare against the post-incremented value, otherwise we must compare
// against the preincremented value.
@@ -2376,6 +2379,7 @@ bool IndVarSimplify::run(Loop *L) {
// Loop::getCanonicalInductionVariable only supports loops with preheaders,
// and we're in trouble if we can't find the induction variable even when
// we've manually inserted one.
+ // - LFTR relies on having a single backedge.
if (!L->isLoopSimplifyForm())
return false;
@@ -2492,8 +2496,9 @@ PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
if (!IVS.run(&L))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
- return getLoopPassPreservedAnalyses();
+ auto PA = getLoopPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
namespace {
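
ConvertToSInt, touched by the first IndVarSimplify hunk, only accepts a floating-point constant that converts to an integer exactly. A standalone illustration of that exactness requirement using plain doubles; LLVM itself performs the check with APFloat::convertToInteger and the isExact flag:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Returns true only when V is a finite value that is exactly an int64_t.
    static bool convertToSInt(double V, int64_t &IntVal) {
      const double kMin = -9223372036854775808.0;  // -2^63
      const double kMax = 9223372036854775808.0;   // 2^63
      if (!std::isfinite(V) || std::trunc(V) != V)
        return false;                              // not an exact integer value
      if (V < kMin || V >= kMax)
        return false;                              // out of int64_t range
      IntVal = static_cast<int64_t>(V);
      return true;
    }

    int main() {
      int64_t I;
      std::printf("42.0  -> %s\n", convertToSInt(42.0, I) ? "exact" : "rejected");
      std::printf("0.5   -> %s\n", convertToSInt(0.5, I) ? "exact" : "rejected");
      std::printf("1e300 -> %s\n", convertToSInt(1e300, I) ? "exact" : "rejected");
      return 0;
    }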
diff --git a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index 8e81541c2337..85db6e5e1105 100644
--- a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -446,6 +446,15 @@ struct LoopStructure {
BasicBlock *LatchExit;
unsigned LatchBrExitIdx;
+ // The loop represented by this instance of LoopStructure is semantically
+ // equivalent to:
+ //
+ // intN_ty inc = IndVarIncreasing ? 1 : -1;
+ // pred_ty predicate = IndVarIncreasing ? ICMP_SLT : ICMP_SGT;
+ //
+ // for (intN_ty iv = IndVarStart; predicate(iv, LoopExitAt); iv = IndVarNext)
+ // ... body ...
+
Value *IndVarNext;
Value *IndVarStart;
Value *LoopExitAt;
@@ -789,6 +798,10 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
return None;
}
+ const SCEV *StartNext = IndVarNext->getStart();
+ const SCEV *Addend = SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE));
+ const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
+
ConstantInt *One = ConstantInt::get(IndVarTy, 1);
// TODO: generalize the predicates here to also match their unsigned variants.
if (IsIncreasing) {
@@ -809,10 +822,22 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
return None;
}
+ if (!SE.isLoopEntryGuardedByCond(
+ &L, CmpInst::ICMP_SLT, IndVarStart,
+ SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType())))) {
+ FailureReason = "Induction variable start not bounded by upper limit";
+ return None;
+ }
+
IRBuilder<> B(Preheader->getTerminator());
RightValue = B.CreateAdd(RightValue, One);
+ } else {
+ if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SLT, IndVarStart,
+ RightSCEV)) {
+ FailureReason = "Induction variable start not bounded by upper limit";
+ return None;
+ }
}
-
} else {
bool FoundExpectedPred =
(Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 1) ||
@@ -831,15 +856,24 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
return None;
}
+ if (!SE.isLoopEntryGuardedByCond(
+ &L, CmpInst::ICMP_SGT, IndVarStart,
+ SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType())))) {
+ FailureReason = "Induction variable start not bounded by lower limit";
+ return None;
+ }
+
IRBuilder<> B(Preheader->getTerminator());
RightValue = B.CreateSub(RightValue, One);
+ } else {
+ if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SGT, IndVarStart,
+ RightSCEV)) {
+ FailureReason = "Induction variable start not bounded by lower limit";
+ return None;
+ }
}
}
- const SCEV *StartNext = IndVarNext->getStart();
- const SCEV *Addend = SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE));
- const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
-
BasicBlock *LatchExit = LatchBr->getSuccessor(LatchBrExitIdx);
assert(SE.getLoopDisposition(LatchCount, &L) ==
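
The new isLoopEntryGuardedByCond checks require the induction variable's start to already be bounded by the limit when the loop is entered; for the increasing case this amounts to Start < Limit + 1. A toy integer version of that predicate, ignoring overflow, whereas the pass itself reasons symbolically through ScalarEvolution:

    #include <cstdint>
    #include <cstdio>

    // For an increasing loop `for (iv = Start; iv < Limit; ++iv)`, the start
    // value must not already be past the upper limit (overflow ignored here).
    static bool startBoundedByUpperLimit(int64_t Start, int64_t Limit) {
      return Start < Limit + 1;
    }

    int main() {
      std::printf("Start=0,  Limit=10 -> %s\n",
                  startBoundedByUpperLimit(0, 10) ? "ok" : "reject");
      std::printf("Start=11, Limit=10 -> %s\n",
                  startBoundedByUpperLimit(11, 10) ? "ok" : "reject");
      return 0;
    }

The decreasing case in the second hunk is symmetric, checking Start > Limit - 1 against the lower limit.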
diff --git a/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
new file mode 100644
index 000000000000..5d8701431a2c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -0,0 +1,903 @@
+//===- InferAddressSpaces.cpp - Infer address spaces ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// CUDA C/C++ includes memory space designation as variable type qualifiers (such
+// as __global__ and __shared__). Knowing the space of a memory access allows
+// CUDA compilers to emit faster PTX loads and stores. For example, a load from
+// shared memory can be translated to `ld.shared` which is roughly 10% faster
+// than a generic `ld` on an NVIDIA Tesla K40c.
+//
+// Unfortunately, type qualifiers only apply to variable declarations, so CUDA
+// compilers must infer the memory space of an address expression from
+// type-qualified variables.
+//
+// LLVM IR uses non-zero (so-called) specific address spaces to represent memory
+// spaces (e.g. addrspace(3) means shared memory). The Clang frontend
+// places only type-qualified variables in specific address spaces, and then
+// conservatively `addrspacecast`s each type-qualified variable to addrspace(0)
+// (the so-called generic address space) for other instructions to use.
+//
+// For example, Clang translates the following CUDA code
+// __shared__ float a[10];
+// float v = a[i];
+// to
+// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
+// %1 = gep [10 x float], [10 x float]* %0, i64 0, i64 %i
+// %v = load float, float* %1 ; emits ld.f32
+// @a is in addrspace(3) since it's type-qualified, but its use from %1 is
+// redirected to %0 (the generic version of @a).
+//
+// The optimization implemented in this file propagates specific address spaces
+// from type-qualified variable declarations to their users. For example, it
+// optimizes the above IR to
+// %1 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+// %v = load float addrspace(3)* %1 ; emits ld.shared.f32
+// propagating the addrspace(3) from @a to %1. As a result, the NVPTX
+// codegen is able to emit ld.shared.f32 for %v.
+//
+// Address space inference works in two steps. First, it uses a data-flow
+// analysis to infer as many generic pointers as possible to point to only one
+// specific address space. In the above example, it can prove that %1 only
+// points to addrspace(3). This algorithm was published in
+// CUDA: Compiling and optimizing for a GPU platform
+// Chakrabarti, Grover, Aarts, Kong, Kudlur, Lin, Marathe, Murphy, Wang
+// ICCS 2012
+//
+// Then, address space inference replaces all refinable generic pointers with
+// equivalent specific pointers.
+//
+// The major challenge of implementing this optimization is handling PHINodes,
+// which may create loops in the data flow graph. This brings two complications.
+//
+// First, the data flow analysis in Step 1 needs to be circular. For example,
+// %generic.input = addrspacecast float addrspace(3)* %input to float*
+// loop:
+// %y = phi [ %generic.input, %y2 ]
+// %y2 = getelementptr %y, 1
+// %v = load %y2
+// br ..., label %loop, ...
+// proving %y specific requires proving both %generic.input and %y2 specific,
+// but proving %y2 specific circles back to %y. To address this complication,
+// the data flow analysis operates on a lattice:
+// uninitialized > specific address spaces > generic.
+// All address expressions (our implementation only considers phi, bitcast,
+// addrspacecast, and getelementptr) start with the uninitialized address space.
+// The monotone transfer function moves the address space of a pointer down a
+// lattice path from uninitialized to specific and then to generic. A join
+// operation of two different specific address spaces pushes the expression down
+// to the generic address space. The analysis completes once it reaches a fixed
+// point.
+//
+// Second, IR rewriting in Step 2 also needs to be circular. For example,
+// converting %y to addrspace(3) requires the compiler to know the converted
+// %y2, but converting %y2 needs the converted %y. To address this complication,
+// we break these cycles using "undef" placeholders. When converting an
+// instruction `I` to a new address space, if its operand `Op` is not converted
+// yet, we let `I` temporarily use `undef` and fix all the uses of undef later.
+// For instance, our algorithm first converts %y to
+// %y' = phi float addrspace(3)* [ %input, undef ]
+// Then, it converts %y2 to
+// %y2' = getelementptr %y', 1
+// Finally, it fixes the undef in %y' so that
+// %y' = phi float addrspace(3)* [ %input, %y2' ]
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+#define DEBUG_TYPE "infer-address-spaces"
+
+using namespace llvm;
+
+namespace {
+static const unsigned UninitializedAddressSpace = ~0u;
+
+using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
+
+/// \brief InferAddressSpaces
+class InferAddressSpaces : public FunctionPass {
+ /// Target-specific address space whose uses should be replaced if
+ /// possible.
+ unsigned FlatAddrSpace;
+
+public:
+ static char ID;
+
+ InferAddressSpaces() : FunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ // Returns the new address space of V if updated; otherwise, returns None.
+ Optional<unsigned>
+ updateAddressSpace(const Value &V,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace) const;
+
+ // Tries to infer the specific address space of each address expression in
+ // Postorder.
+ void inferAddressSpaces(const std::vector<Value *> &Postorder,
+ ValueToAddrSpaceMapTy *InferredAddrSpace) const;
+
+ bool isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const;
+
+ // Changes the flat address expressions in function F to point to specific
+ // address spaces if InferredAddrSpace says so. Postorder is the postorder of
+ // all flat expressions in the use-def graph of function F.
+ bool
+ rewriteWithNewAddressSpaces(const std::vector<Value *> &Postorder,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace,
+ Function *F) const;
+
+ void appendsFlatAddressExpressionToPostorderStack(
+ Value *V, std::vector<std::pair<Value *, bool>> *PostorderStack,
+ DenseSet<Value *> *Visited) const;
+
+ bool rewriteIntrinsicOperands(IntrinsicInst *II,
+ Value *OldV, Value *NewV) const;
+ void collectRewritableIntrinsicOperands(
+ IntrinsicInst *II,
+ std::vector<std::pair<Value *, bool>> *PostorderStack,
+ DenseSet<Value *> *Visited) const;
+
+ std::vector<Value *> collectFlatAddressExpressions(Function &F) const;
+
+ Value *cloneValueWithNewAddressSpace(
+ Value *V, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) const;
+ unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) const;
+};
+} // end anonymous namespace
+
+char InferAddressSpaces::ID = 0;
+
+namespace llvm {
+void initializeInferAddressSpacesPass(PassRegistry &);
+}
+
+INITIALIZE_PASS(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces",
+ false, false)
+
+// Returns true if V is an address expression.
+// TODO: Currently, we consider only phi, bitcast, addrspacecast, and
+// getelementptr operators.
+static bool isAddressExpression(const Value &V) {
+ if (!isa<Operator>(V))
+ return false;
+
+ switch (cast<Operator>(V).getOpcode()) {
+ case Instruction::PHI:
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::GetElementPtr:
+ case Instruction::Select:
+ return true;
+ default:
+ return false;
+ }
+}
+
+// Returns the pointer operands of V.
+//
+// Precondition: V is an address expression.
+static SmallVector<Value *, 2> getPointerOperands(const Value &V) {
+ assert(isAddressExpression(V));
+ const Operator &Op = cast<Operator>(V);
+ switch (Op.getOpcode()) {
+ case Instruction::PHI: {
+ auto IncomingValues = cast<PHINode>(Op).incoming_values();
+ return SmallVector<Value *, 2>(IncomingValues.begin(),
+ IncomingValues.end());
+ }
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::GetElementPtr:
+ return {Op.getOperand(0)};
+ case Instruction::Select:
+ return {Op.getOperand(1), Op.getOperand(2)};
+ default:
+ llvm_unreachable("Unexpected instruction type.");
+ }
+}
+
+// TODO: Move logic to TTI?
+bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II,
+ Value *OldV,
+ Value *NewV) const {
+ Module *M = II->getParent()->getParent()->getParent();
+
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec: {
+ const ConstantInt *IsVolatile = dyn_cast<ConstantInt>(II->getArgOperand(4));
+ if (!IsVolatile || !IsVolatile->isNullValue())
+ return false;
+
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::objectsize: {
+ Type *DestTy = II->getType();
+ Type *SrcTy = NewV->getType();
+ Function *NewDecl =
+ Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
+ II->setArgOperand(0, NewV);
+ II->setCalledFunction(NewDecl);
+ return true;
+ }
+ default:
+ return false;
+ }
+}
+
+// TODO: Move logic to TTI?
+void InferAddressSpaces::collectRewritableIntrinsicOperands(
+ IntrinsicInst *II, std::vector<std::pair<Value *, bool>> *PostorderStack,
+ DenseSet<Value *> *Visited) const {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::objectsize:
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec:
+ appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
+ PostorderStack, Visited);
+ break;
+ default:
+ break;
+ }
+}
+
+// If V is an unvisited flat address expression, appends V to PostorderStack
+// and marks it as visited.
+void InferAddressSpaces::appendsFlatAddressExpressionToPostorderStack(
+ Value *V, std::vector<std::pair<Value *, bool>> *PostorderStack,
+ DenseSet<Value *> *Visited) const {
+ assert(V->getType()->isPointerTy());
+ if (isAddressExpression(*V) &&
+ V->getType()->getPointerAddressSpace() == FlatAddrSpace) {
+ if (Visited->insert(V).second)
+ PostorderStack->push_back(std::make_pair(V, false));
+ }
+}
+
+// Returns all flat address expressions in function F. The elements are
+// ordered in postorder.
+std::vector<Value *>
+InferAddressSpaces::collectFlatAddressExpressions(Function &F) const {
+ // This function implements a non-recursive postorder traversal of a partial
+ // use-def graph of function F.
+ std::vector<std::pair<Value *, bool>> PostorderStack;
+ // The set of visited expressions.
+ DenseSet<Value *> Visited;
+
+ auto PushPtrOperand = [&](Value *Ptr) {
+ appendsFlatAddressExpressionToPostorderStack(Ptr, &PostorderStack,
+ &Visited);
+ };
+
+ // We only explore address expressions that are reachable from loads and
+ // stores for now because we aim at generating faster loads and stores.
+ for (Instruction &I : instructions(F)) {
+ if (auto *LI = dyn_cast<LoadInst>(&I))
+ PushPtrOperand(LI->getPointerOperand());
+ else if (auto *SI = dyn_cast<StoreInst>(&I))
+ PushPtrOperand(SI->getPointerOperand());
+ else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
+ PushPtrOperand(RMW->getPointerOperand());
+ else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
+ PushPtrOperand(CmpX->getPointerOperand());
+ else if (auto *MI = dyn_cast<MemIntrinsic>(&I)) {
+ // For memset/memcpy/memmove, any pointer operand can be replaced.
+ PushPtrOperand(MI->getRawDest());
+
+ // Handle 2nd operand for memcpy/memmove.
+ if (auto *MTI = dyn_cast<MemTransferInst>(MI))
+ PushPtrOperand(MTI->getRawSource());
+ } else if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ collectRewritableIntrinsicOperands(II, &PostorderStack, &Visited);
+ else if (ICmpInst *Cmp = dyn_cast<ICmpInst>(&I)) {
+ // FIXME: Handle vectors of pointers
+ if (Cmp->getOperand(0)->getType()->isPointerTy()) {
+ PushPtrOperand(Cmp->getOperand(0));
+ PushPtrOperand(Cmp->getOperand(1));
+ }
+ }
+ }
+
+ std::vector<Value *> Postorder; // The resultant postorder.
+ while (!PostorderStack.empty()) {
+ // If the operands of the expression on top of the stack are already
+ // explored, adds that expression to the resultant postorder.
+ if (PostorderStack.back().second) {
+ Postorder.push_back(PostorderStack.back().first);
+ PostorderStack.pop_back();
+ continue;
+ }
+ // Otherwise, adds its operands to the stack and explores them.
+ PostorderStack.back().second = true;
+ for (Value *PtrOperand : getPointerOperands(*PostorderStack.back().first)) {
+ appendsFlatAddressExpressionToPostorderStack(PtrOperand, &PostorderStack,
+ &Visited);
+ }
+ }
+ return Postorder;
+}
+
+// A helper function for cloneInstructionWithNewAddressSpace. Returns the clone
+// of OperandUse.get() in the new address space. If the clone is not ready yet,
+// returns an undef in the new address space as a placeholder.
+static Value *operandWithNewAddressSpaceOrCreateUndef(
+ const Use &OperandUse, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) {
+ Value *Operand = OperandUse.get();
+
+ Type *NewPtrTy =
+ Operand->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+ if (Constant *C = dyn_cast<Constant>(Operand))
+ return ConstantExpr::getAddrSpaceCast(C, NewPtrTy);
+
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand))
+ return NewOperand;
+
+ UndefUsesToFix->push_back(&OperandUse);
+ return UndefValue::get(NewPtrTy);
+}
+
+// Returns a clone of `I` with its operands converted to those specified in
+// ValueWithNewAddrSpace. Due to potential cycles in the data flow graph, an
+// operand whose address space needs to be modified might not exist in
+// ValueWithNewAddrSpace. In that case, uses undef as a placeholder operand and
+// adds that operand use to UndefUsesToFix so that caller can fix them later.
+//
+// Note that we do not necessarily clone `I`, e.g., if it is an addrspacecast
+// from a pointer whose type already matches. Therefore, this function returns a
+// Value* instead of an Instruction*.
+static Value *cloneInstructionWithNewAddressSpace(
+ Instruction *I, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) {
+ Type *NewPtrType =
+ I->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+ if (I->getOpcode() == Instruction::AddrSpaceCast) {
+ Value *Src = I->getOperand(0);
+ // Because `I` is flat, the source address space must be specific.
+ // Therefore, the inferred address space must be the source space, according
+ // to our algorithm.
+ assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
+ if (Src->getType() != NewPtrType)
+ return new BitCastInst(Src, NewPtrType);
+ return Src;
+ }
+
+ // Computes the converted pointer operands.
+ SmallVector<Value *, 4> NewPointerOperands;
+ for (const Use &OperandUse : I->operands()) {
+ if (!OperandUse.get()->getType()->isPointerTy())
+ NewPointerOperands.push_back(nullptr);
+ else
+ NewPointerOperands.push_back(operandWithNewAddressSpaceOrCreateUndef(
+ OperandUse, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix));
+ }
+
+ switch (I->getOpcode()) {
+ case Instruction::BitCast:
+ return new BitCastInst(NewPointerOperands[0], NewPtrType);
+ case Instruction::PHI: {
+ assert(I->getType()->isPointerTy());
+ PHINode *PHI = cast<PHINode>(I);
+ PHINode *NewPHI = PHINode::Create(NewPtrType, PHI->getNumIncomingValues());
+ for (unsigned Index = 0; Index < PHI->getNumIncomingValues(); ++Index) {
+ unsigned OperandNo = PHINode::getOperandNumForIncomingValue(Index);
+ NewPHI->addIncoming(NewPointerOperands[OperandNo],
+ PHI->getIncomingBlock(Index));
+ }
+ return NewPHI;
+ }
+ case Instruction::GetElementPtr: {
+ GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
+ GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
+ GEP->getSourceElementType(), NewPointerOperands[0],
+ SmallVector<Value *, 4>(GEP->idx_begin(), GEP->idx_end()));
+ NewGEP->setIsInBounds(GEP->isInBounds());
+ return NewGEP;
+ }
+ case Instruction::Select: {
+ assert(I->getType()->isPointerTy());
+ return SelectInst::Create(I->getOperand(0), NewPointerOperands[1],
+ NewPointerOperands[2], "", nullptr, I);
+ }
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+}
+
+// Similar to cloneInstructionWithNewAddressSpace, returns a clone of the
+// constant expression `CE` with its operands replaced as specified in
+// ValueWithNewAddrSpace.
+static Value *cloneConstantExprWithNewAddressSpace(
+ ConstantExpr *CE, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace) {
+ Type *TargetType =
+ CE->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+ if (CE->getOpcode() == Instruction::AddrSpaceCast) {
+ // Because CE is flat, the source address space must be specific.
+ // Therefore, the inferred address space must be the source space according
+ // to our algorithm.
+ assert(CE->getOperand(0)->getType()->getPointerAddressSpace() ==
+ NewAddrSpace);
+ return ConstantExpr::getBitCast(CE->getOperand(0), TargetType);
+ }
+
+ if (CE->getOpcode() == Instruction::BitCast) {
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(CE->getOperand(0)))
+ return ConstantExpr::getBitCast(cast<Constant>(NewOperand), TargetType);
+ return ConstantExpr::getAddrSpaceCast(CE, TargetType);
+ }
+
+ if (CE->getOpcode() == Instruction::Select) {
+ Constant *Src0 = CE->getOperand(1);
+ Constant *Src1 = CE->getOperand(2);
+ if (Src0->getType()->getPointerAddressSpace() ==
+ Src1->getType()->getPointerAddressSpace()) {
+
+ return ConstantExpr::getSelect(
+ CE->getOperand(0), ConstantExpr::getAddrSpaceCast(Src0, TargetType),
+ ConstantExpr::getAddrSpaceCast(Src1, TargetType));
+ }
+ }
+
+ // Computes the operands of the new constant expression.
+ SmallVector<Constant *, 4> NewOperands;
+ for (unsigned Index = 0; Index < CE->getNumOperands(); ++Index) {
+ Constant *Operand = CE->getOperand(Index);
+ // If the address space of `Operand` needs to be modified, the new operand
+ // with the new address space should already be in ValueWithNewAddrSpace
+ // because (1) the constant expressions we consider (i.e. addrspacecast,
+ // bitcast, and getelementptr) do not incur cycles in the data flow graph
+ // and (2) this function is called on constant expressions in postorder.
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand)) {
+ NewOperands.push_back(cast<Constant>(NewOperand));
+ } else {
+ // Otherwise, reuses the old operand.
+ NewOperands.push_back(Operand);
+ }
+ }
+
+ if (CE->getOpcode() == Instruction::GetElementPtr) {
+ // Needs to specify the source type while constructing a getelementptr
+ // constant expression.
+ return CE->getWithOperands(
+ NewOperands, TargetType, /*OnlyIfReduced=*/false,
+ NewOperands[0]->getType()->getPointerElementType());
+ }
+
+ return CE->getWithOperands(NewOperands, TargetType);
+}
+
+// Returns a clone of the value `V`, with its operands replaced as specified in
+// ValueWithNewAddrSpace. This function is called on every flat address
+// expression whose address space needs to be modified, in postorder.
+//
+// See cloneInstructionWithNewAddressSpace for the meaning of UndefUsesToFix.
+Value *InferAddressSpaces::cloneValueWithNewAddressSpace(
+ Value *V, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) const {
+ // All values in Postorder are flat address expressions.
+ assert(isAddressExpression(*V) &&
+ V->getType()->getPointerAddressSpace() == FlatAddrSpace);
+
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ Value *NewV = cloneInstructionWithNewAddressSpace(
+ I, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix);
+ if (Instruction *NewI = dyn_cast<Instruction>(NewV)) {
+ if (NewI->getParent() == nullptr) {
+ NewI->insertBefore(I);
+ NewI->takeName(I);
+ }
+ }
+ return NewV;
+ }
+
+ return cloneConstantExprWithNewAddressSpace(
+ cast<ConstantExpr>(V), NewAddrSpace, ValueWithNewAddrSpace);
+}
+
+// Defines the join operation on the address space lattice (see the file header
+// comments).
+unsigned InferAddressSpaces::joinAddressSpaces(unsigned AS1,
+ unsigned AS2) const {
+ if (AS1 == FlatAddrSpace || AS2 == FlatAddrSpace)
+ return FlatAddrSpace;
+
+ if (AS1 == UninitializedAddressSpace)
+ return AS2;
+ if (AS2 == UninitializedAddressSpace)
+ return AS1;
+
+ // The join of two different specific address spaces is flat.
+ return (AS1 == AS2) ? AS1 : FlatAddrSpace;
+}
+
+bool InferAddressSpaces::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ FlatAddrSpace = TTI.getFlatAddressSpace();
+ if (FlatAddrSpace == UninitializedAddressSpace)
+ return false;
+
+ // Collects all flat address expressions in postorder.
+ std::vector<Value *> Postorder = collectFlatAddressExpressions(F);
+
+ // Runs a data-flow analysis to refine the address spaces of every expression
+ // in Postorder.
+ ValueToAddrSpaceMapTy InferredAddrSpace;
+ inferAddressSpaces(Postorder, &InferredAddrSpace);
+
+ // Changes the address spaces of the flat address expressions that are inferred
+ // to point to a specific address space.
+ return rewriteWithNewAddressSpaces(Postorder, InferredAddrSpace, &F);
+}
+
+void InferAddressSpaces::inferAddressSpaces(
+ const std::vector<Value *> &Postorder,
+ ValueToAddrSpaceMapTy *InferredAddrSpace) const {
+ SetVector<Value *> Worklist(Postorder.begin(), Postorder.end());
+ // Initially, all expressions are in the uninitialized address space.
+ for (Value *V : Postorder)
+ (*InferredAddrSpace)[V] = UninitializedAddressSpace;
+
+ while (!Worklist.empty()) {
+ Value *V = Worklist.pop_back_val();
+
+ // Tries to update the address space of the stack top according to the
+ // address spaces of its operands.
+ DEBUG(dbgs() << "Updating the address space of\n " << *V << '\n');
+ Optional<unsigned> NewAS = updateAddressSpace(*V, *InferredAddrSpace);
+ if (!NewAS.hasValue())
+ continue;
+ // If any update is made, adds the users of V to the worklist because
+ // their address spaces may also need to be updated.
+ DEBUG(dbgs() << " to " << NewAS.getValue() << '\n');
+ (*InferredAddrSpace)[V] = NewAS.getValue();
+
+ for (Value *User : V->users()) {
+ // Skip if User is already in the worklist.
+ if (Worklist.count(User))
+ continue;
+
+ auto Pos = InferredAddrSpace->find(User);
+ // Our algorithm only updates the address spaces of flat address
+ // expressions, which are those in InferredAddrSpace.
+ if (Pos == InferredAddrSpace->end())
+ continue;
+
+ // Function updateAddressSpace moves the address space down a lattice
+ // path. Therefore, nothing to do if User is already inferred as flat (the
+ // bottom element in the lattice).
+ if (Pos->second == FlatAddrSpace)
+ continue;
+
+ Worklist.insert(User);
+ }
+ }
+}
+
+Optional<unsigned> InferAddressSpaces::updateAddressSpace(
+ const Value &V, const ValueToAddrSpaceMapTy &InferredAddrSpace) const {
+ assert(InferredAddrSpace.count(&V));
+
+ // The new inferred address space equals the join of the address spaces
+ // of all its pointer operands.
+ unsigned NewAS = UninitializedAddressSpace;
+
+ const Operator &Op = cast<Operator>(V);
+ if (Op.getOpcode() == Instruction::Select) {
+ Value *Src0 = Op.getOperand(1);
+ Value *Src1 = Op.getOperand(2);
+
+ auto I = InferredAddrSpace.find(Src0);
+ unsigned Src0AS = (I != InferredAddrSpace.end()) ?
+ I->second : Src0->getType()->getPointerAddressSpace();
+
+ auto J = InferredAddrSpace.find(Src1);
+ unsigned Src1AS = (J != InferredAddrSpace.end()) ?
+ J->second : Src1->getType()->getPointerAddressSpace();
+
+ auto *C0 = dyn_cast<Constant>(Src0);
+ auto *C1 = dyn_cast<Constant>(Src1);
+
+ // If one of the inputs is a constant, we may be able to do a constant
+ // addrspacecast of it. Defer inferring the address space until the input
+ // address space is known.
+ if ((C1 && Src0AS == UninitializedAddressSpace) ||
+ (C0 && Src1AS == UninitializedAddressSpace))
+ return None;
+
+ if (C0 && isSafeToCastConstAddrSpace(C0, Src1AS))
+ NewAS = Src1AS;
+ else if (C1 && isSafeToCastConstAddrSpace(C1, Src0AS))
+ NewAS = Src0AS;
+ else
+ NewAS = joinAddressSpaces(Src0AS, Src1AS);
+ } else {
+ for (Value *PtrOperand : getPointerOperands(V)) {
+ auto I = InferredAddrSpace.find(PtrOperand);
+ unsigned OperandAS = I != InferredAddrSpace.end() ?
+ I->second : PtrOperand->getType()->getPointerAddressSpace();
+
+ // join(flat, *) = flat. So we can break if NewAS is already flat.
+ NewAS = joinAddressSpaces(NewAS, OperandAS);
+ if (NewAS == FlatAddrSpace)
+ break;
+ }
+ }
+
+ unsigned OldAS = InferredAddrSpace.lookup(&V);
+ assert(OldAS != FlatAddrSpace);
+ if (OldAS == NewAS)
+ return None;
+ return NewAS;
+}
+
+/// Returns true if \p U is the pointer operand of a memory instruction with
+/// a single pointer operand that can have its address space changed by simply
+/// mutating the use to a new value.
+static bool isSimplePointerUseValidToReplace(Use &U) {
+ User *Inst = U.getUser();
+ unsigned OpNo = U.getOperandNo();
+
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
+ return OpNo == LoadInst::getPointerOperandIndex() && !LI->isVolatile();
+
+ if (auto *SI = dyn_cast<StoreInst>(Inst))
+ return OpNo == StoreInst::getPointerOperandIndex() && !SI->isVolatile();
+
+ if (auto *RMW = dyn_cast<AtomicRMWInst>(Inst))
+ return OpNo == AtomicRMWInst::getPointerOperandIndex() && !RMW->isVolatile();
+
+ if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ return OpNo == AtomicCmpXchgInst::getPointerOperandIndex() &&
+ !CmpX->isVolatile();
+ }
+
+ return false;
+}
+
+/// Update memory intrinsic uses that require more complex processing than
+/// simple memory instructions. These require re-mangling and may have multiple
+/// pointer operands.
+static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV,
+ Value *NewV) {
+ IRBuilder<> B(MI);
+ MDNode *TBAA = MI->getMetadata(LLVMContext::MD_tbaa);
+ MDNode *ScopeMD = MI->getMetadata(LLVMContext::MD_alias_scope);
+ MDNode *NoAliasMD = MI->getMetadata(LLVMContext::MD_noalias);
+
+ if (auto *MSI = dyn_cast<MemSetInst>(MI)) {
+ B.CreateMemSet(NewV, MSI->getValue(),
+ MSI->getLength(), MSI->getAlignment(),
+ false, // isVolatile
+ TBAA, ScopeMD, NoAliasMD);
+ } else if (auto *MTI = dyn_cast<MemTransferInst>(MI)) {
+ Value *Src = MTI->getRawSource();
+ Value *Dest = MTI->getRawDest();
+
+ // Be careful in case this is a self-to-self copy.
+ if (Src == OldV)
+ Src = NewV;
+
+ if (Dest == OldV)
+ Dest = NewV;
+
+ if (isa<MemCpyInst>(MTI)) {
+ MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct);
+ B.CreateMemCpy(Dest, Src, MTI->getLength(),
+ MTI->getAlignment(),
+ false, // isVolatile
+ TBAA, TBAAStruct, ScopeMD, NoAliasMD);
+ } else {
+ assert(isa<MemMoveInst>(MTI));
+ B.CreateMemMove(Dest, Src, MTI->getLength(),
+ MTI->getAlignment(),
+ false, // isVolatile
+ TBAA, ScopeMD, NoAliasMD);
+ }
+ } else
+ llvm_unreachable("unhandled MemIntrinsic");
+
+ MI->eraseFromParent();
+ return true;
+}
+
+// Returns true if it is OK to change the address space of constant \p C with
+// a ConstantExpr addrspacecast.
+bool InferAddressSpaces::isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const {
+ assert(NewAS != UninitializedAddressSpace);
+
+ unsigned SrcAS = C->getType()->getPointerAddressSpace();
+ if (SrcAS == NewAS || isa<UndefValue>(C))
+ return true;
+
+ // Prevent illegal casts between different non-flat address spaces.
+ if (SrcAS != FlatAddrSpace && NewAS != FlatAddrSpace)
+ return false;
+
+ if (isa<ConstantPointerNull>(C))
+ return true;
+
+ if (auto *Op = dyn_cast<Operator>(C)) {
+ // If we already have a constant addrspacecast, it should be safe to cast it
+ // off.
+ if (Op->getOpcode() == Instruction::AddrSpaceCast)
+ return isSafeToCastConstAddrSpace(cast<Constant>(Op->getOperand(0)), NewAS);
+
+ if (Op->getOpcode() == Instruction::IntToPtr &&
+ Op->getType()->getPointerAddressSpace() == FlatAddrSpace)
+ return true;
+ }
+
+ return false;
+}
+
+static Value::use_iterator skipToNextUser(Value::use_iterator I,
+ Value::use_iterator End) {
+ User *CurUser = I->getUser();
+ ++I;
+
+ while (I != End && I->getUser() == CurUser)
+ ++I;
+
+ return I;
+}
+
+bool InferAddressSpaces::rewriteWithNewAddressSpaces(
+ const std::vector<Value *> &Postorder,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const {
+ // For each address expression to be modified, creates a clone of it with its
+ // pointer operands converted to the new address space. Since the pointer
+ // operands are converted, the clone is naturally in the new address space by
+ // construction.
+ ValueToValueMapTy ValueWithNewAddrSpace;
+ SmallVector<const Use *, 32> UndefUsesToFix;
+ for (Value* V : Postorder) {
+ unsigned NewAddrSpace = InferredAddrSpace.lookup(V);
+ if (V->getType()->getPointerAddressSpace() != NewAddrSpace) {
+ ValueWithNewAddrSpace[V] = cloneValueWithNewAddressSpace(
+ V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix);
+ }
+ }
+
+ if (ValueWithNewAddrSpace.empty())
+ return false;
+
+ // Fixes all the undef uses generated by cloneInstructionWithNewAddressSpace.
+ for (const Use *UndefUse : UndefUsesToFix) {
+ User *V = UndefUse->getUser();
+ User *NewV = cast<User>(ValueWithNewAddrSpace.lookup(V));
+ unsigned OperandNo = UndefUse->getOperandNo();
+ assert(isa<UndefValue>(NewV->getOperand(OperandNo)));
+ NewV->setOperand(OperandNo, ValueWithNewAddrSpace.lookup(UndefUse->get()));
+ }
+
+ // Replaces the uses of the old address expressions with the new ones.
+ for (Value *V : Postorder) {
+ Value *NewV = ValueWithNewAddrSpace.lookup(V);
+ if (NewV == nullptr)
+ continue;
+
+ DEBUG(dbgs() << "Replacing the uses of " << *V
+ << "\n with\n " << *NewV << '\n');
+
+ Value::use_iterator I, E, Next;
+ for (I = V->use_begin(), E = V->use_end(); I != E; ) {
+ Use &U = *I;
+
+ // Some users may refer to the same pointer operand in more than one of
+ // their operands; skip ahead to the next user.
+ I = skipToNextUser(I, E);
+
+ if (isSimplePointerUseValidToReplace(U)) {
+ // If V is used as the pointer operand of a compatible memory operation,
+ // sets the pointer operand to NewV. This replacement does not change
+ // the element type, so the resultant load/store is still valid.
+ U.set(NewV);
+ continue;
+ }
+
+ User *CurUser = U.getUser();
+ // Handle more complex cases like intrinsic that need to be remangled.
+ if (auto *MI = dyn_cast<MemIntrinsic>(CurUser)) {
+ if (!MI->isVolatile() && handleMemIntrinsicPtrUse(MI, V, NewV))
+ continue;
+ }
+
+ if (auto *II = dyn_cast<IntrinsicInst>(CurUser)) {
+ if (rewriteIntrinsicOperands(II, V, NewV))
+ continue;
+ }
+
+ if (isa<Instruction>(CurUser)) {
+ if (ICmpInst *Cmp = dyn_cast<ICmpInst>(CurUser)) {
+ // If we can infer that both pointers are in the same addrspace,
+ // transform e.g.
+ // %cmp = icmp eq float* %p, %q
+ // into
+ // %cmp = icmp eq float addrspace(3)* %new_p, %new_q
+
+ unsigned NewAS = NewV->getType()->getPointerAddressSpace();
+ int SrcIdx = U.getOperandNo();
+ int OtherIdx = (SrcIdx == 0) ? 1 : 0;
+ Value *OtherSrc = Cmp->getOperand(OtherIdx);
+
+ if (Value *OtherNewV = ValueWithNewAddrSpace.lookup(OtherSrc)) {
+ if (OtherNewV->getType()->getPointerAddressSpace() == NewAS) {
+ Cmp->setOperand(OtherIdx, OtherNewV);
+ Cmp->setOperand(SrcIdx, NewV);
+ continue;
+ }
+ }
+
+ // Even if the type mismatches, we can cast the constant.
+ if (auto *KOtherSrc = dyn_cast<Constant>(OtherSrc)) {
+ if (isSafeToCastConstAddrSpace(KOtherSrc, NewAS)) {
+ Cmp->setOperand(SrcIdx, NewV);
+ Cmp->setOperand(OtherIdx,
+ ConstantExpr::getAddrSpaceCast(KOtherSrc, NewV->getType()));
+ continue;
+ }
+ }
+ }
+
+ // Otherwise, replaces the use with flat(NewV).
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ BasicBlock::iterator InsertPos = std::next(I->getIterator());
+ while (isa<PHINode>(InsertPos))
+ ++InsertPos;
+ U.set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
+ } else {
+ U.set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
+ V->getType()));
+ }
+ }
+ }
+
+ if (V->use_empty())
+ RecursivelyDeleteTriviallyDeadInstructions(V);
+ }
+
+ return true;
+}
+
+FunctionPass *llvm::createInferAddressSpacesPass() {
+ return new InferAddressSpaces();
+}
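
The data-flow step described in the new file's header comment can be summarized with a standalone sketch. The toy value graph below is hypothetical (plain C++, not LLVM IR); only the join function and the fixed-point loop mirror what joinAddressSpaces and inferAddressSpaces do:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    static const unsigned Uninitialized = ~0u;  // top of the lattice
    static const unsigned Flat = 0;             // bottom of the lattice

    static unsigned join(unsigned AS1, unsigned AS2) {
      if (AS1 == Flat || AS2 == Flat) return Flat;
      if (AS1 == Uninitialized) return AS2;
      if (AS2 == Uninitialized) return AS1;
      return AS1 == AS2 ? AS1 : Flat;  // two different specific spaces -> flat
    }

    // Each expression joins the spaces of its operands; KnownAS models an
    // addrspacecast source whose space is fixed from the start.
    struct Expr { std::vector<std::size_t> Operands; unsigned KnownAS; };

    int main() {
      // %0 = cast from addrspace(3); %1 = gep %0; %2 = phi [%1, %2] (a cycle).
      std::vector<Expr> G = {{{}, 3u}, {{0}, Uninitialized},
                             {{1, 2}, Uninitialized}};
      std::vector<unsigned> AS(G.size(), Uninitialized);

      bool Changed = true;  // naive fixed point: iterate until nothing moves
      while (Changed) {
        Changed = false;
        for (std::size_t I = 0; I < G.size(); ++I) {
          unsigned New = G[I].KnownAS;
          for (std::size_t Op : G[I].Operands)
            New = join(New, AS[Op]);
          if (New != AS[I]) { AS[I] = New; Changed = true; }
        }
      }
      for (std::size_t I = 0; I < G.size(); ++I)
        std::printf("%%%zu -> addrspace(%u)\n", I, AS[I]);
      return 0;
    }

On the cyclic phi example from the header comment this converges with every expression in addrspace(3); any disagreement between specific spaces would push the result down to flat instead.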
diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 1870c3deb4f3..08eb95a1a3d3 100644
--- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
@@ -30,11 +31,13 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <algorithm>
@@ -89,6 +92,7 @@ namespace {
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<LazyValueInfoWrapperPass>();
AU.addPreserved<LazyValueInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
@@ -104,6 +108,7 @@ INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
"Jump Threading", false, false)
INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(JumpThreading, "jump-threading",
"Jump Threading", false, false)
@@ -121,6 +126,7 @@ bool JumpThreading::runOnFunction(Function &F) {
return false;
auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
auto LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
+ auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
bool HasProfileData = F.getEntryCount().hasValue();
@@ -129,7 +135,8 @@ bool JumpThreading::runOnFunction(Function &F) {
BPI.reset(new BranchProbabilityInfo(F, LI));
BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
}
- return Impl.runImpl(F, TLI, LVI, HasProfileData, std::move(BFI),
+
+ return Impl.runImpl(F, TLI, LVI, AA, HasProfileData, std::move(BFI),
std::move(BPI));
}
@@ -138,6 +145,8 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &LVI = AM.getResult<LazyValueAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
bool HasProfileData = F.getEntryCount().hasValue();
@@ -146,12 +155,9 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,
BPI.reset(new BranchProbabilityInfo(F, LI));
BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
}
- bool Changed =
- runImpl(F, &TLI, &LVI, HasProfileData, std::move(BFI), std::move(BPI));
- // FIXME: We need to invalidate LVI to avoid PR28400. Is there a better
- // solution?
- AM.invalidate<LazyValueAnalysis>(F);
+ bool Changed = runImpl(F, &TLI, &LVI, &AA, HasProfileData, std::move(BFI),
+ std::move(BPI));
if (!Changed)
return PreservedAnalyses::all();
@@ -161,18 +167,23 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,
}
bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
- LazyValueInfo *LVI_, bool HasProfileData_,
+ LazyValueInfo *LVI_, AliasAnalysis *AA_,
+ bool HasProfileData_,
std::unique_ptr<BlockFrequencyInfo> BFI_,
std::unique_ptr<BranchProbabilityInfo> BPI_) {
DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
TLI = TLI_;
LVI = LVI_;
+ AA = AA_;
BFI.reset();
BPI.reset();
// When profile data is available, we need to update edge weights after
// successful jump threading, which requires both BPI and BFI being available.
HasProfileData = HasProfileData_;
+ auto *GuardDecl = F.getParent()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ HasGuards = GuardDecl && !GuardDecl->use_empty();
if (HasProfileData) {
BPI = std::move(BPI_);
BFI = std::move(BFI_);
@@ -226,26 +237,13 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
BB != &BB->getParent()->getEntryBlock() &&
// If the terminator is the only non-phi instruction, try to nuke it.
BB->getFirstNonPHIOrDbg()->isTerminator() && !LoopHeaders.count(BB)) {
- // Since TryToSimplifyUncondBranchFromEmptyBlock may delete the
- // block, we have to make sure it isn't in the LoopHeaders set. We
- // reinsert afterward if needed.
- bool ErasedFromLoopHeaders = LoopHeaders.erase(BB);
- BasicBlock *Succ = BI->getSuccessor(0);
-
// FIXME: It is always conservatively correct to drop the info
// for a block even if it doesn't get erased. This isn't totally
// awesome, but it allows us to use AssertingVH to prevent nasty
// dangling pointer issues within LazyValueInfo.
LVI->eraseBlock(BB);
- if (TryToSimplifyUncondBranchFromEmptyBlock(BB)) {
+ if (TryToSimplifyUncondBranchFromEmptyBlock(BB))
Changed = true;
- // If we deleted BB and BB was the header of a loop, then the
- // successor is now the header of the loop.
- BB = Succ;
- }
-
- if (ErasedFromLoopHeaders)
- LoopHeaders.insert(BB);
}
}
EverChanged |= Changed;
@@ -255,10 +253,13 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
return EverChanged;
}
-/// getJumpThreadDuplicationCost - Return the cost of duplicating this block to
-/// thread across it. Stop scanning the block when passing the threshold.
-static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
+/// Return the cost of duplicating a piece of this block, from the first
+/// non-phi instruction up to (but not including) StopAt, in order to thread
+/// across it. Stop scanning the block when exceeding the threshold. If
+/// duplication is impossible, returns ~0U.
+static unsigned getJumpThreadDuplicationCost(BasicBlock *BB,
+ Instruction *StopAt,
unsigned Threshold) {
+ assert(StopAt->getParent() == BB && "Not an instruction from proper BB?");
/// Ignore PHI nodes, these will be flattened when duplication happens.
BasicBlock::const_iterator I(BB->getFirstNonPHI());
@@ -266,15 +267,17 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
// branch, so they shouldn't count against the duplication cost.
unsigned Bonus = 0;
- const TerminatorInst *BBTerm = BB->getTerminator();
- // Threading through a switch statement is particularly profitable. If this
- // block ends in a switch, decrease its cost to make it more likely to happen.
- if (isa<SwitchInst>(BBTerm))
- Bonus = 6;
-
- // The same holds for indirect branches, but slightly more so.
- if (isa<IndirectBrInst>(BBTerm))
- Bonus = 8;
+ if (BB->getTerminator() == StopAt) {
+ // Threading through a switch statement is particularly profitable. If this
+ // block ends in a switch, decrease its cost to make it more likely to
+ // happen.
+ if (isa<SwitchInst>(StopAt))
+ Bonus = 6;
+
+ // The same holds for indirect branches, but slightly more so.
+ if (isa<IndirectBrInst>(StopAt))
+ Bonus = 8;
+ }
// Bump the threshold up so the early exit from the loop doesn't skip the
// terminator-based Size adjustment at the end.
@@ -283,7 +286,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
// Sum up the cost of each instruction until we get to the terminator. Don't
// include the terminator because the copy won't include it.
unsigned Size = 0;
- for (; !isa<TerminatorInst>(I); ++I) {
+ for (; &*I != StopAt; ++I) {
// Stop scanning the block if we've reached the threshold.
if (Size > Threshold)
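
getJumpThreadDuplicationCost now stops at an arbitrary StopAt instruction and only grants the switch/indirect-branch bonus when StopAt is the terminator itself. A toy version of that scan, using a hypothetical unit-cost model rather than LLVM's per-instruction costs:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    enum Kind { Phi, Simple, Switch, IndirectBr };
    struct Inst { Kind K; };

    static unsigned duplicationCost(const std::vector<Inst> &BB,
                                    std::size_t StopAt) {
      unsigned Bonus = 0;
      if (StopAt == BB.size() - 1) {           // StopAt is the terminator
        if (BB[StopAt].K == Switch)     Bonus = 6;
        if (BB[StopAt].K == IndirectBr) Bonus = 8;
      }
      unsigned Size = 0;
      std::size_t I = 0;
      while (I < BB.size() && BB[I].K == Phi)
        ++I;                                   // PHIs are flattened when duplicating
      for (; I < StopAt; ++I)
        ++Size;                                // every remaining instruction costs 1
      return Size > Bonus ? Size - Bonus : 0;
    }

    int main() {
      std::vector<Inst> BB = {{Phi}, {Simple}, {Simple}, {Switch}};
      std::printf("cost up to terminator: %u\n", duplicationCost(BB, 3));
      std::printf("cost up to inst #2:    %u\n", duplicationCost(BB, 2));
      return 0;
    }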
@@ -729,6 +732,10 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
if (TryToUnfoldSelectInCurrBB(BB))
return true;
+ // Look if we can propagate guards to predecessors.
+ if (HasGuards && ProcessGuards(BB))
+ return true;
+
// What kind of constant we're looking for.
ConstantPreference Preference = WantInteger;
@@ -804,7 +811,6 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
return false;
}
-
if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) {
// If we're branching on a conditional, LVI might be able to determine
// it's value at the branch instruction. We only handle comparisons
@@ -812,7 +818,12 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// TODO: This should be extended to handle switches as well.
BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1));
- if (CondBr && CondConst && CondBr->isConditional()) {
+ if (CondBr && CondConst) {
+ // We should have returned as soon as we turned a conditional branch into
+ // an unconditional one, because it is no longer interesting as far as
+ // jump threading is concerned.
+ assert(CondBr->isConditional() && "Threading on unconditional terminator");
+
LazyValueInfo::Tristate Ret =
LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
CondConst, CondBr);
@@ -835,10 +846,12 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
}
return true;
}
- }
- if (CondBr && CondConst && TryToUnfoldSelect(CondCmp, BB))
- return true;
+ // We did not manage to simplify this branch; see whether CondCmp
+ // depends on a known phi-select pattern.
+ if (TryToUnfoldSelect(CondCmp, BB))
+ return true;
+ }
}
// Check for some cases that are worth simplifying. Right now we want to look
@@ -857,7 +870,6 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
if (SimplifyPartiallyRedundantLoad(LI))
return true;
-
// Handle a variety of cases where we are branching on something derived from
// a PHI node in the current block. If we can prove that any predecessors
// compute a predictable value based on a PHI node, thread those predecessors.
@@ -871,7 +883,6 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
return ProcessBranchOnPHI(PN);
-
// If this is an otherwise-unfoldable branch on a XOR, see if we can simplify.
if (CondInst->getOpcode() == Instruction::Xor &&
CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
@@ -920,6 +931,14 @@ bool JumpThreadingPass::ProcessImpliedCondition(BasicBlock *BB) {
return false;
}
+/// Return true if Op is an instruction defined in the given block.
+static bool isOpDefinedInBlock(Value *Op, BasicBlock *BB) {
+ if (Instruction *OpInst = dyn_cast<Instruction>(Op))
+ if (OpInst->getParent() == BB)
+ return true;
+ return false;
+}
+
/// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant
/// load instruction, eliminate it by replacing it with a PHI node. This is an
/// important optimization that encourages jump threading, and needs to be run
@@ -942,18 +961,17 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
Value *LoadedPtr = LI->getOperand(0);
- // If the loaded operand is defined in the LoadBB, it can't be available.
- // TODO: Could do simple PHI translation, that would be fun :)
- if (Instruction *PtrOp = dyn_cast<Instruction>(LoadedPtr))
- if (PtrOp->getParent() == LoadBB)
- return false;
+ // If the loaded operand is defined in the LoadBB and it is not a phi,
+ // it can't be available in predecessors.
+ if (isOpDefinedInBlock(LoadedPtr, LoadBB) && !isa<PHINode>(LoadedPtr))
+ return false;
// Scan a few instructions up from the load, to see if it is obviously live at
// the entry to its block.
BasicBlock::iterator BBIt(LI);
bool IsLoadCSE;
- if (Value *AvailableVal =
- FindAvailableLoadedValue(LI, LoadBB, BBIt, DefMaxInstsToScan, nullptr, &IsLoadCSE)) {
+ if (Value *AvailableVal = FindAvailableLoadedValue(
+ LI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) {
// If the value of the load is locally available within the block, just use
// it. This frequently occurs for reg2mem'd allocas.
@@ -997,12 +1015,34 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
if (!PredsScanned.insert(PredBB).second)
continue;
- // Scan the predecessor to see if the value is available in the pred.
BBIt = PredBB->end();
- Value *PredAvailable = FindAvailableLoadedValue(LI, PredBB, BBIt,
- DefMaxInstsToScan,
- nullptr,
- &IsLoadCSE);
+ unsigned NumScanedInst = 0;
+ Value *PredAvailable = nullptr;
+ // NOTE: We don't CSE a load that is volatile or anything stronger than
+ // unordered; that should have been checked when we entered the function.
+ assert(LI->isUnordered() && "Attempting to CSE volatile or atomic loads");
+ // If this is a load on a phi pointer, phi-translate it and search
+ // for available load/store to the pointer in predecessors.
+ Value *Ptr = LoadedPtr->DoPHITranslation(LoadBB, PredBB);
+ PredAvailable = FindAvailablePtrLoadStore(
+ Ptr, LI->getType(), LI->isAtomic(), PredBB, BBIt, DefMaxInstsToScan,
+ AA, &IsLoadCSE, &NumScanedInst);
+
+ // If PredBB has a single predecessor, continue scanning through the
+ // single predecessor.
+ BasicBlock *SinglePredBB = PredBB;
+ while (!PredAvailable && SinglePredBB && BBIt == SinglePredBB->begin() &&
+ NumScanedInst < DefMaxInstsToScan) {
+ SinglePredBB = SinglePredBB->getSinglePredecessor();
+ if (SinglePredBB) {
+ BBIt = SinglePredBB->end();
+ PredAvailable = FindAvailablePtrLoadStore(
+ Ptr, LI->getType(), LI->isAtomic(), SinglePredBB, BBIt,
+ (DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE,
+ &NumScanedInst);
+ }
+ }
+
if (!PredAvailable) {
OneUnavailablePred = PredBB;
continue;
@@ -1062,10 +1102,10 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
if (UnavailablePred) {
assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
"Can't handle critical edge here!");
- LoadInst *NewVal =
- new LoadInst(LoadedPtr, LI->getName() + ".pr", false,
- LI->getAlignment(), LI->getOrdering(), LI->getSynchScope(),
- UnavailablePred->getTerminator());
+ LoadInst *NewVal = new LoadInst(
+ LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred),
+ LI->getName() + ".pr", false, LI->getAlignment(), LI->getOrdering(),
+ LI->getSynchScope(), UnavailablePred->getTerminator());
NewVal->setDebugLoc(LI->getDebugLoc());
if (AATags)
NewVal->setAAMetadata(AATags);
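Sketched at the source level, the partially redundant load case that the PHI-translation change above enables (a hedged, illustrative example; the function and variable names are invented and the actual transformation happens on LLVM IR):

int pick(int *a, int *b, bool c) {
  int *p;
  int sum = 0;
  if (c) {
    sum += *a;   // the loaded value is already available in this predecessor
    p = a;
  } else {
    sum += *b;   // ... and in this one
    p = b;
  }
  // *p is partially redundant: phi-translating p per predecessor shows the
  // load can be replaced by a phi of the two values loaded above.
  return sum + *p;
}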
@@ -1229,7 +1269,7 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()))
DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero());
else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
- DestBB = SI->findCaseValue(cast<ConstantInt>(Val)).getCaseSuccessor();
+ DestBB = SI->findCaseValue(cast<ConstantInt>(Val))->getCaseSuccessor();
} else {
assert(isa<IndirectBrInst>(BB->getTerminator())
&& "Unexpected terminator");
@@ -1468,7 +1508,8 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
return false;
}
- unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+ unsigned JumpThreadCost =
+ getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
if (JumpThreadCost > BBDupThreshold) {
DEBUG(dbgs() << " Not threading BB '" << BB->getName()
<< "' - Cost is too high: " << JumpThreadCost << "\n");
@@ -1756,7 +1797,8 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
return false;
}
- unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+ unsigned DuplicationCost =
+ getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
if (DuplicationCost > BBDupThreshold) {
DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
<< "' - Cost is too high: " << DuplicationCost << "\n");
@@ -1888,10 +1930,10 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
/// TryToUnfoldSelect - Look for blocks of the form
/// bb1:
/// %a = select
-/// br bb
+/// br bb2
///
/// bb2:
-/// %p = phi [%a, %bb] ...
+/// %p = phi [%a, %bb1] ...
/// %c = icmp %p
/// br i1 %c
///
@@ -2021,3 +2063,130 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
return false;
}
+
+/// Try to propagate a guard from the current BB into one of its predecessors
+/// in case another branch of execution implies that the condition of this
+/// guard is always true. Currently we only process the simplest case that
+/// looks like:
+///
+/// Start:
+/// %cond = ...
+/// br i1 %cond, label %T1, label %F1
+/// T1:
+/// br label %Merge
+/// F1:
+/// br label %Merge
+/// Merge:
+/// %condGuard = ...
+/// call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
+///
+/// And cond either implies condGuard or !condGuard. In this case all the
+/// instructions before the guard can be duplicated in both branches, and the
+/// guard is then threaded to one of them.
+bool JumpThreadingPass::ProcessGuards(BasicBlock *BB) {
+ using namespace PatternMatch;
+ // We only want to deal with two predecessors.
+ BasicBlock *Pred1, *Pred2;
+ auto PI = pred_begin(BB), PE = pred_end(BB);
+ if (PI == PE)
+ return false;
+ Pred1 = *PI++;
+ if (PI == PE)
+ return false;
+ Pred2 = *PI++;
+ if (PI != PE)
+ return false;
+ if (Pred1 == Pred2)
+ return false;
+
+ // Try to thread one of the guards of the block.
+ // TODO: Look deeper than the immediate predecessor?
+ auto *Parent = Pred1->getSinglePredecessor();
+ if (!Parent || Parent != Pred2->getSinglePredecessor())
+ return false;
+
+ if (auto *BI = dyn_cast<BranchInst>(Parent->getTerminator()))
+ for (auto &I : *BB)
+ if (match(&I, m_Intrinsic<Intrinsic::experimental_guard>()))
+ if (ThreadGuard(BB, cast<IntrinsicInst>(&I), BI))
+ return true;
+
+ return false;
+}
+
+/// Try to propagate the guard from BB which is the lower block of a diamond
+/// to one of its branches, in case the diamond's condition implies the
+/// guard's condition.
+bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard,
+ BranchInst *BI) {
+ assert(BI->getNumSuccessors() == 2 && "Wrong number of successors?");
+ assert(BI->isConditional() && "Unconditional branch has 2 successors?");
+ Value *GuardCond = Guard->getArgOperand(0);
+ Value *BranchCond = BI->getCondition();
+ BasicBlock *TrueDest = BI->getSuccessor(0);
+ BasicBlock *FalseDest = BI->getSuccessor(1);
+
+ auto &DL = BB->getModule()->getDataLayout();
+ bool TrueDestIsSafe = false;
+ bool FalseDestIsSafe = false;
+
+ // True dest is safe if BranchCond => GuardCond.
+ auto Impl = isImpliedCondition(BranchCond, GuardCond, DL);
+ if (Impl && *Impl)
+ TrueDestIsSafe = true;
+ else {
+ // False dest is safe if !BranchCond => GuardCond.
+ Impl =
+ isImpliedCondition(BranchCond, GuardCond, DL, /* InvertAPred */ true);
+ if (Impl && *Impl)
+ FalseDestIsSafe = true;
+ }
+
+ if (!TrueDestIsSafe && !FalseDestIsSafe)
+ return false;
+
+ BasicBlock *UnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest;
+ BasicBlock *GuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest;
+
+ ValueToValueMapTy UnguardedMapping, GuardedMapping;
+ Instruction *AfterGuard = Guard->getNextNode();
+ unsigned Cost = getJumpThreadDuplicationCost(BB, AfterGuard, BBDupThreshold);
+ if (Cost > BBDupThreshold)
+ return false;
+ // Duplicate all instructions before the guard and the guard itself to the
+ // branch where implication is not proved.
+ GuardedBlock = DuplicateInstructionsInSplitBetween(
+ BB, GuardedBlock, AfterGuard, GuardedMapping);
+ assert(GuardedBlock && "Could not create the guarded block?");
+ // Duplicate all instructions before the guard in the unguarded branch.
+ // Since we have successfully duplicated the guarded block and this block
+ // has fewer instructions, we expect it to succeed.
+ UnguardedBlock = DuplicateInstructionsInSplitBetween(BB, UnguardedBlock,
+ Guard, UnguardedMapping);
+ assert(UnguardedBlock && "Could not create the unguarded block?");
+ DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
+ << GuardedBlock->getName() << "\n");
+
+ // Some instructions before the guard may still have uses. For them, we need
+ // to create Phi nodes merging their copies in both guarded and unguarded
+ // branches. Those instructions that have no uses can be just removed.
+ SmallVector<Instruction *, 4> ToRemove;
+ for (auto BI = BB->begin(); &*BI != AfterGuard; ++BI)
+ if (!isa<PHINode>(&*BI))
+ ToRemove.push_back(&*BI);
+
+ Instruction *InsertionPoint = &*BB->getFirstInsertionPt();
+ assert(InsertionPoint && "Empty block?");
+ // Substitute with Phis & remove.
+ for (auto *Inst : reverse(ToRemove)) {
+ if (!Inst->use_empty()) {
+ PHINode *NewPN = PHINode::Create(Inst->getType(), 2);
+ NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock);
+ NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock);
+ NewPN->insertBefore(InsertionPoint);
+ Inst->replaceAllUsesWith(NewPN);
+ }
+ Inst->eraseFromParent();
+ }
+ return true;
+}
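A hedged, source-level illustration of the diamond shape that ProcessGuards/ThreadGuard handles (llvm.experimental.guard is emitted by managed-language frontends; the plain C++ below only mirrors the control flow, and every name is invented):

void example(int x) {
  bool cond = (x > 0);            // branch condition of the diamond
  if (cond) { /* T1 */ } else { /* F1 */ }
  // Merge block:
  bool condGuard = (x >= 0);      // guard condition; here cond => condGuard
  if (!condGuard) {
    /* deoptimize */              // after threading, this check survives only
  }                               // on the F1 path, where it is still needed
}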
diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
index f51d11c04cb2..340c81fed0fd 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -77,10 +77,16 @@ STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk");
STATISTIC(NumMovedCalls, "Number of call insts hoisted or sunk");
STATISTIC(NumPromoted, "Number of memory locations promoted to registers");
+/// Memory promotion is enabled by default.
static cl::opt<bool>
- DisablePromotion("disable-licm-promotion", cl::Hidden,
+ DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false),
cl::desc("Disable memory promotion in LICM pass"));
+static cl::opt<uint32_t> MaxNumUsesTraversed(
+ "licm-max-num-uses-traversed", cl::Hidden, cl::init(8),
+ cl::desc("Max num uses visited for identifying load "
+ "invariance in loop using invariant start (default = 8)"));
+
static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
const LoopSafetyInfo *SafetyInfo);
@@ -201,9 +207,9 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.SE, ORE, true))
return PreservedAnalyses::all();
- // FIXME: There is no setPreservesCFG in the new PM. When that becomes
- // available, it should be used here.
- return getLoopPassPreservedAnalyses();
+ auto PA = getLoopPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
char LegacyLICMPass::ID = 0;
@@ -425,6 +431,29 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
continue;
}
+ // Attempt to remove floating point division out of the loop by converting
+ // it to a reciprocal multiplication.
+ if (I.getOpcode() == Instruction::FDiv &&
+ CurLoop->isLoopInvariant(I.getOperand(1)) &&
+ I.hasAllowReciprocal()) {
+ auto Divisor = I.getOperand(1);
+ auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
+ auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
+ ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
+ ReciprocalDivisor->insertBefore(&I);
+
+ auto Product = BinaryOperator::CreateFMul(I.getOperand(0),
+ ReciprocalDivisor);
+ Product->setFastMathFlags(I.getFastMathFlags());
+ Product->insertAfter(&I);
+ I.replaceAllUsesWith(Product);
+ I.eraseFromParent();
+
+ hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE);
+ Changed = true;
+ continue;
+ }
+
// Try hoisting the instruction out to the preheader. We can only do this
// if all of the operands of the instruction are loop invariant and if it
// is safe to hoist the instruction.
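What the new reciprocal rule buys, as a minimal source-level sketch (illustrative only; the IR-level rewrite fires only when the fdiv carries the allow-reciprocal fast-math flag and its divisor is loop invariant):

#include <cstddef>

void scale(float *a, std::size_t n, float d) {
  // Original body: a[i] = a[i] / d;  -- one fdiv per iteration.
  float recip = 1.0f / d;               // hoisted reciprocal, computed once
  for (std::size_t i = 0; i < n; ++i)
    a[i] = a[i] * recip;                // an fmul inside the loop instead
}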
@@ -461,7 +490,10 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
SafetyInfo->MayThrow = SafetyInfo->HeaderMayThrow;
// Iterate over loop instructions and compute safety info.
- for (Loop::block_iterator BB = CurLoop->block_begin(),
+ // Skip header as it has been computed and stored in HeaderMayThrow.
+ // The first block in loopinfo.Blocks is guaranteed to be the header.
+ assert(Header == *CurLoop->getBlocks().begin() && "First block must be header");
+ for (Loop::block_iterator BB = std::next(CurLoop->block_begin()),
BBE = CurLoop->block_end();
(BB != BBE) && !SafetyInfo->MayThrow; ++BB)
for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end();
@@ -477,6 +509,59 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
SafetyInfo->BlockColors = colorEHFunclets(*Fn);
}
+// Return true if LI is invariant within the scope of the loop. LI is invariant if
+// CurLoop is dominated by an invariant.start representing the same memory location
+// and size as the memory location LI loads from, and also the invariant.start
+// has no uses.
+static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
+ Loop *CurLoop) {
+ Value *Addr = LI->getOperand(0);
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+ const uint32_t LocSizeInBits = DL.getTypeSizeInBits(
+ cast<PointerType>(Addr->getType())->getElementType());
+
+ // If the type is i8 addrspace(x)*, we know this is the type of
+ // the llvm.invariant.start operand.
+ auto *PtrInt8Ty = PointerType::get(Type::getInt8Ty(LI->getContext()),
+ LI->getPointerAddressSpace());
+ unsigned BitcastsVisited = 0;
+ // Look through bitcasts until we reach the i8* type (this is invariant.start
+ // operand type).
+ while (Addr->getType() != PtrInt8Ty) {
+ auto *BC = dyn_cast<BitCastInst>(Addr);
+ // Avoid traversing a high number of bitcast uses.
+ if (++BitcastsVisited > MaxNumUsesTraversed || !BC)
+ return false;
+ Addr = BC->getOperand(0);
+ }
+
+ unsigned UsesVisited = 0;
+ // Traverse all uses of the load operand value, to see if invariant.start is
+ // one of the uses, and whether it dominates the load instruction.
+ for (auto *U : Addr->users()) {
+ // Avoid traversing load operands with a high number of users.
+ if (++UsesVisited > MaxNumUsesTraversed)
+ return false;
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
+ // If there are escaping uses of the invariant.start instruction, the load
+ // may be non-invariant.
+ if (!II || II->getIntrinsicID() != Intrinsic::invariant_start ||
+ II->hasNUsesOrMore(1))
+ continue;
+ unsigned InvariantSizeInBits =
+ cast<ConstantInt>(II->getArgOperand(0))->getSExtValue() * 8;
+ // Confirm the invariant.start location size contains the load operand size
+ // in bits. Also, the invariant.start should dominate the load, and we
+ // should not hoist the load out of a loop that contains this dominating
+ // invariant.start.
+ if (LocSizeInBits <= InvariantSizeInBits &&
+ DT->properlyDominates(II->getParent(), CurLoop->getHeader()))
+ return true;
+ }
+
+ return false;
+}
+
bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
Loop *CurLoop, AliasSetTracker *CurAST,
LoopSafetyInfo *SafetyInfo,
@@ -493,6 +578,10 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
if (LI->getMetadata(LLVMContext::MD_invariant_load))
return true;
+ // This checks for an invariant.start dominating the load.
+ if (isLoadInvariantInLoop(LI, DT, CurLoop))
+ return true;
+
// Don't hoist loads which have may-aliased stores in loop.
uint64_t Size = 0;
if (LI->getType()->isSized())
@@ -782,7 +871,7 @@ static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I
<< "\n");
ORE->emit(OptimizationRemark(DEBUG_TYPE, "Hoisted", &I)
- << "hosting " << ore::NV("Inst", &I));
+ << "hoisting " << ore::NV("Inst", &I));
// Metadata can be dependent on conditions we are hoisting above.
// Conservatively strip all metadata on the instruction unless we were
@@ -852,6 +941,7 @@ class LoopPromoter : public LoadAndStorePromoter {
LoopInfo &LI;
DebugLoc DL;
int Alignment;
+ bool UnorderedAtomic;
AAMDNodes AATags;
Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const {
@@ -875,10 +965,11 @@ public:
SmallVectorImpl<BasicBlock *> &LEB,
SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC,
AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment,
- const AAMDNodes &AATags)
+ bool UnorderedAtomic, const AAMDNodes &AATags)
: LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast),
- LI(li), DL(std::move(dl)), Alignment(alignment), AATags(AATags) {}
+ LI(li), DL(std::move(dl)), Alignment(alignment),
+ UnorderedAtomic(UnorderedAtomic),AATags(AATags) {}
bool isInstInList(Instruction *I,
const SmallVectorImpl<Instruction *> &) const override {
@@ -902,6 +993,8 @@ public:
Value *Ptr = maybeInsertLCSSAPHI(SomePtr, ExitBlock);
Instruction *InsertPos = LoopInsertPts[i];
StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos);
+ if (UnorderedAtomic)
+ NewSI->setOrdering(AtomicOrdering::Unordered);
NewSI->setAlignment(Alignment);
NewSI->setDebugLoc(DL);
if (AATags)
@@ -992,18 +1085,41 @@ bool llvm::promoteLoopAccessesToScalars(
// We start with an alignment of one and try to find instructions that allow
// us to prove better alignment.
unsigned Alignment = 1;
+ // Keep track of which types of access we see
+ bool SawUnorderedAtomic = false;
+ bool SawNotAtomic = false;
AAMDNodes AATags;
const DataLayout &MDL = Preheader->getModule()->getDataLayout();
+ // Do we know this object does not escape?
+ bool IsKnownNonEscapingObject = false;
if (SafetyInfo->MayThrow) {
// If a loop can throw, we have to insert a store along each unwind edge.
// That said, we can't actually make the unwind edge explicit. Therefore,
// we have to prove that the store is dead along the unwind edge.
//
- // Currently, this code just special-cases alloca instructions.
- if (!isa<AllocaInst>(GetUnderlyingObject(SomePtr, MDL)))
- return false;
+ // If the underlying object is neither an alloca nor a pointer that does not
+ // escape, then we cannot effectively prove that the store is dead along
+ // the unwind edge, i.e. the caller of this function could have ways to
+ // access the pointed-to object.
+ Value *Object = GetUnderlyingObject(SomePtr, MDL);
+ // If this is a base pointer we do not understand, simply bail.
+ // We only handle alloca and return value from alloc-like fn right now.
+ if (!isa<AllocaInst>(Object)) {
+ if (!isAllocLikeFn(Object, TLI))
+ return false;
+ // If this is an alloc-like fn, there are more constraints we need to verify.
+ // More specifically, we must make sure that the pointer cannot escape.
+ //
+ // NOTE: PointerMayBeCaptured is not enough, as the pointer may have escaped
+ // even though it is not captured by the enclosing function. Standard allocation
+ // functions like malloc, calloc, and operator new return values which can
+ // be assumed not to have previously escaped.
+ if (PointerMayBeCaptured(Object, true, true))
+ return false;
+ IsKnownNonEscapingObject = true;
+ }
}
// Check that all of the pointers in the alias set have the same type. We
@@ -1029,8 +1145,11 @@ bool llvm::promoteLoopAccessesToScalars(
// it.
if (LoadInst *Load = dyn_cast<LoadInst>(UI)) {
assert(!Load->isVolatile() && "AST broken");
- if (!Load->isSimple())
+ if (!Load->isUnordered())
return false;
+
+ SawUnorderedAtomic |= Load->isAtomic();
+ SawNotAtomic |= !Load->isAtomic();
if (!DereferenceableInPH)
DereferenceableInPH = isSafeToExecuteUnconditionally(
@@ -1041,9 +1160,12 @@ bool llvm::promoteLoopAccessesToScalars(
if (UI->getOperand(1) != ASIV)
continue;
assert(!Store->isVolatile() && "AST broken");
- if (!Store->isSimple())
+ if (!Store->isUnordered())
return false;
+ SawUnorderedAtomic |= Store->isAtomic();
+ SawNotAtomic |= !Store->isAtomic();
+
// If the store is guaranteed to execute, both properties are satisfied.
// We may want to check if a store is guaranteed to execute even if we
// already know that promotion is safe, since it may have higher
@@ -1096,6 +1218,12 @@ bool llvm::promoteLoopAccessesToScalars(
}
}
+ // If we found both an unordered atomic instruction and a non-atomic memory
+ // access, bail. We can't blindly promote non-atomic to atomic since we
+ // might not be able to lower the result. We can't downgrade since that
+ // would violate the memory model. Also, align 0 is an error for atomics.
+ if (SawUnorderedAtomic && SawNotAtomic)
+ return false;
// If we couldn't prove we can hoist the load, bail.
if (!DereferenceableInPH)
@@ -1106,10 +1234,15 @@ bool llvm::promoteLoopAccessesToScalars(
// stores along paths which originally didn't have them without violating the
// memory model.
if (!SafeToInsertStore) {
- Value *Object = GetUnderlyingObject(SomePtr, MDL);
- SafeToInsertStore =
- (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) &&
+ // If this is a known non-escaping object, it is safe to insert the stores.
+ if (IsKnownNonEscapingObject)
+ SafeToInsertStore = true;
+ else {
+ Value *Object = GetUnderlyingObject(SomePtr, MDL);
+ SafeToInsertStore =
+ (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) &&
!PointerMayBeCaptured(Object, true, true);
+ }
}
// If we've still failed to prove we can sink the store, give up.
@@ -1134,12 +1267,15 @@ bool llvm::promoteLoopAccessesToScalars(
SmallVector<PHINode *, 16> NewPHIs;
SSAUpdater SSA(&NewPHIs);
LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
- InsertPts, PIC, *CurAST, *LI, DL, Alignment, AATags);
+ InsertPts, PIC, *CurAST, *LI, DL, Alignment,
+ SawUnorderedAtomic, AATags);
// Set up the preheader to have a definition of the value. It is the live-out
// value from the preheader that uses in the loop will use.
LoadInst *PreheaderLoad = new LoadInst(
SomePtr, SomePtr->getName() + ".promoted", Preheader->getTerminator());
+ if (SawUnorderedAtomic)
+ PreheaderLoad->setOrdering(AtomicOrdering::Unordered);
PreheaderLoad->setAlignment(Alignment);
PreheaderLoad->setDebugLoc(DL);
if (AATags)
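For context, a hedged source-level sketch of the scalar promotion this code performs, which the patch above extends to unordered-atomic accesses (names invented):

void accumulate(int *sum, const int *v, int n) {
  // Each iteration loads and stores *sum. Promotion turns this into one load
  // in the preheader, a register accumulation in the loop, and a single store
  // in the exit block; with this change, if the accesses are unordered
  // atomics, the inserted preheader load and exit stores stay atomic too.
  for (int i = 0; i < n; ++i)
    *sum += v[i];
}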
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp
index 389f1c595aa4..02215d3450c2 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp
@@ -19,6 +19,7 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetFolder.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
@@ -53,18 +54,20 @@ struct LoadPOPPair {
class LoadCombine : public BasicBlockPass {
LLVMContext *C;
AliasAnalysis *AA;
+ DominatorTree *DT;
public:
LoadCombine() : BasicBlockPass(ID), C(nullptr), AA(nullptr) {
initializeLoadCombinePass(*PassRegistry::getPassRegistry());
}
-
+
using llvm::Pass::doInitialization;
bool doInitialization(Function &) override;
bool runOnBasicBlock(BasicBlock &BB) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
@@ -234,6 +237,14 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) {
return false;
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ // Skip analyzing dead blocks (not forward reachable from the function entry).
+ if (!DT->isReachableFromEntry(&BB)) {
+ DEBUG(dbgs() << "LC: skipping unreachable " << BB.getName() <<
+ " in " << BB.getParent()->getName() << "\n");
+ return false;
+ }
IRBuilder<TargetFolder> TheBuilder(
BB.getContext(), TargetFolder(BB.getModule()->getDataLayout()));
@@ -245,13 +256,17 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) {
bool Combined = false;
unsigned Index = 0;
for (auto &I : BB) {
- if (I.mayThrow() || (I.mayWriteToMemory() && AST.containsUnknown(&I))) {
+ if (I.mayThrow() || AST.containsUnknown(&I)) {
if (combineLoads(LoadMap))
Combined = true;
LoadMap.clear();
AST.clear();
continue;
}
+ if (I.mayWriteToMemory()) {
+ AST.add(&I);
+ continue;
+ }
LoadInst *LI = dyn_cast<LoadInst>(&I);
if (!LI)
continue;
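For reference, a hedged source-level example of the pattern LoadCombine targets (names invented; the change above only refines how intervening may-write instructions are tracked via the AliasSetTracker while such groups are collected):

#include <cstdint>

std::uint32_t lo_hi(const std::uint16_t *p) {
  std::uint16_t lo = p[0];   // two adjacent narrow loads from the same base...
  std::uint16_t hi = p[1];
  // ...can be merged into one 32-bit load plus shifts, provided no instruction
  // between them may write to aliasing memory.
  return static_cast<std::uint32_t>(lo) |
         (static_cast<std::uint32_t>(hi) << 16);
}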
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
index cca75a365024..73e8ce0e1d93 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -29,32 +29,31 @@ using namespace llvm;
STATISTIC(NumDeleted, "Number of loops deleted");
-/// isLoopDead - Determined if a loop is dead. This assumes that we've already
-/// checked for unique exit and exiting blocks, and that the code is in LCSSA
-/// form.
-bool LoopDeletionPass::isLoopDead(Loop *L, ScalarEvolution &SE,
- SmallVectorImpl<BasicBlock *> &exitingBlocks,
- SmallVectorImpl<BasicBlock *> &exitBlocks,
- bool &Changed, BasicBlock *Preheader) {
- BasicBlock *exitBlock = exitBlocks[0];
-
+/// Determines if a loop is dead.
+///
+/// This assumes that we've already checked for unique exit and exiting blocks,
+/// and that the code is in LCSSA form.
+static bool isLoopDead(Loop *L, ScalarEvolution &SE,
+ SmallVectorImpl<BasicBlock *> &ExitingBlocks,
+ BasicBlock *ExitBlock, bool &Changed,
+ BasicBlock *Preheader) {
// Make sure that all PHI entries coming from the loop are loop invariant.
// Because the code is in LCSSA form, any values used outside of the loop
// must pass through a PHI in the exit block, meaning that this check is
// sufficient to guarantee that no loop-variant values are used outside
// of the loop.
- BasicBlock::iterator BI = exitBlock->begin();
+ BasicBlock::iterator BI = ExitBlock->begin();
bool AllEntriesInvariant = true;
bool AllOutgoingValuesSame = true;
while (PHINode *P = dyn_cast<PHINode>(BI)) {
- Value *incoming = P->getIncomingValueForBlock(exitingBlocks[0]);
+ Value *incoming = P->getIncomingValueForBlock(ExitingBlocks[0]);
// Make sure all exiting blocks produce the same incoming value for the exit
// block. If there are different incoming values for different exiting
// blocks, then it is impossible to statically determine which value should
// be used.
AllOutgoingValuesSame =
- all_of(makeArrayRef(exitingBlocks).slice(1), [&](BasicBlock *BB) {
+ all_of(makeArrayRef(ExitingBlocks).slice(1), [&](BasicBlock *BB) {
return incoming == P->getIncomingValueForBlock(BB);
});
@@ -78,33 +77,37 @@ bool LoopDeletionPass::isLoopDead(Loop *L, ScalarEvolution &SE,
// Make sure that no instructions in the block have potential side-effects.
// This includes instructions that could write to memory, and loads that are
- // marked volatile. This could be made more aggressive by using aliasing
- // information to identify readonly and readnone calls.
- for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
- LI != LE; ++LI) {
- for (Instruction &I : **LI) {
- if (I.mayHaveSideEffects())
- return false;
- }
- }
-
+ // marked volatile.
+ for (auto &I : L->blocks())
+ if (any_of(*I, [](Instruction &I) { return I.mayHaveSideEffects(); }))
+ return false;
return true;
}
-/// Remove dead loops, by which we mean loops that do not impact the observable
-/// behavior of the program other than finite running time. Note we do ensure
-/// that this never remove a loop that might be infinite, as doing so could
-/// change the halting/non-halting nature of a program. NOTE: This entire
-/// process relies pretty heavily on LoopSimplify and LCSSA in order to make
-/// various safety checks work.
-bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
- LoopInfo &loopInfo) {
+/// Remove a loop if it is dead.
+///
+/// A loop is considered dead if it does not impact the observable behavior of
+/// the program other than finite running time. This never removes a loop that
+/// might be infinite, as doing so could change the halting/non-halting nature
+/// of a program.
+///
+/// This entire process relies pretty heavily on LoopSimplify form and LCSSA in
+/// order to make various safety checks work.
+///
+/// \returns true if any changes were made. This may mutate the loop even if it
+/// is unable to delete it due to hoisting trivially loop invariant
+/// instructions out of the loop.
+///
+/// This also updates the relevant analysis information in \p DT, \p SE, and \p
+/// LI. It also updates the loop PM if an updater struct is provided.
+static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
+ LoopInfo &LI, LPMUpdater *Updater = nullptr) {
assert(L->isLCSSAForm(DT) && "Expected LCSSA!");
// We can only remove the loop if there is a preheader that we can
// branch from after removing it.
- BasicBlock *preheader = L->getLoopPreheader();
- if (!preheader)
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader)
return false;
// If LoopSimplify form is not available, stay out of trouble.
@@ -116,22 +119,20 @@ bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
if (L->begin() != L->end())
return false;
- SmallVector<BasicBlock *, 4> exitingBlocks;
- L->getExitingBlocks(exitingBlocks);
-
- SmallVector<BasicBlock *, 4> exitBlocks;
- L->getUniqueExitBlocks(exitBlocks);
+ SmallVector<BasicBlock *, 4> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
// We require that the loop only have a single exit block. Otherwise, we'd
// be in the situation of needing to be able to solve statically which exit
// block will be branched to, or trying to preserve the branching logic in
// a loop invariant manner.
- if (exitBlocks.size() != 1)
+ BasicBlock *ExitBlock = L->getUniqueExitBlock();
+ if (!ExitBlock)
return false;
// Finally, we have to check that the loop really is dead.
bool Changed = false;
- if (!isLoopDead(L, SE, exitingBlocks, exitBlocks, Changed, preheader))
+ if (!isLoopDead(L, SE, ExitingBlocks, ExitBlock, Changed, Preheader))
return Changed;
// Don't remove loops for which we can't solve the trip count.
@@ -142,11 +143,13 @@ bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
// Now that we know the removal is safe, remove the loop by changing the
// branch from the preheader to go to the single exit block.
- BasicBlock *exitBlock = exitBlocks[0];
-
+ //
// Because we're deleting a large chunk of code at once, the sequence in which
- // we remove things is very important to avoid invalidation issues. Don't
- // mess with this unless you have good reason and know what you're doing.
+ // we remove things is very important to avoid invalidation issues.
+
+ // If we have an LPM updater, tell it about the loop being removed.
+ if (Updater)
+ Updater->markLoopAsDeleted(*L);
// Tell ScalarEvolution that the loop is deleted. Do this before
// deleting the loop so that ScalarEvolution can look at the loop
@@ -154,19 +157,19 @@ bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
SE.forgetLoop(L);
// Connect the preheader directly to the exit block.
- TerminatorInst *TI = preheader->getTerminator();
- TI->replaceUsesOfWith(L->getHeader(), exitBlock);
+ TerminatorInst *TI = Preheader->getTerminator();
+ TI->replaceUsesOfWith(L->getHeader(), ExitBlock);
// Rewrite phis in the exit block to get their inputs from
// the preheader instead of the exiting block.
- BasicBlock *exitingBlock = exitingBlocks[0];
- BasicBlock::iterator BI = exitBlock->begin();
+ BasicBlock *ExitingBlock = ExitingBlocks[0];
+ BasicBlock::iterator BI = ExitBlock->begin();
while (PHINode *P = dyn_cast<PHINode>(BI)) {
- int j = P->getBasicBlockIndex(exitingBlock);
+ int j = P->getBasicBlockIndex(ExitingBlock);
assert(j >= 0 && "Can't find exiting block in exit block's phi node!");
- P->setIncomingBlock(j, preheader);
- for (unsigned i = 1; i < exitingBlocks.size(); ++i)
- P->removeIncomingValue(exitingBlocks[i]);
+ P->setIncomingBlock(j, Preheader);
+ for (unsigned i = 1; i < ExitingBlocks.size(); ++i)
+ P->removeIncomingValue(ExitingBlocks[i]);
++BI;
}
@@ -175,11 +178,11 @@ bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
SmallVector<DomTreeNode*, 8> ChildNodes;
for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
LI != LE; ++LI) {
- // Move all of the block's children to be children of the preheader, which
+ // Move all of the block's children to be children of the Preheader, which
// allows us to remove the domtree entry for the block.
ChildNodes.insert(ChildNodes.begin(), DT[*LI]->begin(), DT[*LI]->end());
for (DomTreeNode *ChildNode : ChildNodes) {
- DT.changeImmediateDominator(ChildNode, DT[preheader]);
+ DT.changeImmediateDominator(ChildNode, DT[Preheader]);
}
ChildNodes.clear();
@@ -204,22 +207,19 @@ bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
SmallPtrSet<BasicBlock *, 8> blocks;
blocks.insert(L->block_begin(), L->block_end());
for (BasicBlock *BB : blocks)
- loopInfo.removeBlock(BB);
+ LI.removeBlock(BB);
// The last step is to update LoopInfo now that we've eliminated this loop.
- loopInfo.markAsRemoved(L);
- Changed = true;
-
+ LI.markAsRemoved(L);
++NumDeleted;
- return Changed;
+ return true;
}
PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
- LPMUpdater &) {
- bool Changed = runImpl(&L, AR.DT, AR.SE, AR.LI);
- if (!Changed)
+ LPMUpdater &Updater) {
+ if (!deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI, &Updater))
return PreservedAnalyses::all();
return getLoopPassPreservedAnalyses();
@@ -257,8 +257,7 @@ bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &) {
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- LoopInfo &loopInfo = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- LoopDeletionPass Impl;
- return Impl.runImpl(L, DT, SE, loopInfo);
+ return deleteLoopIfDead(L, DT, SE, LI);
}
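A minimal C++ illustration of the kind of loop deleteLoopIfDead removes (hedged example, names invented): the trip count is computable, the body has no side effects, and every value live after the loop is loop invariant.

int f(int n) {
  int r = 42;                  // loop-invariant value used after the loop
  for (int i = 0; i < n; ++i) {
    int tmp = i * i;           // dead computation with no side effects
    (void)tmp;
  }
  return r;                    // observable behavior does not depend on the loop
}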
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
index 19716b28ad66..3624bba10345 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -812,29 +812,29 @@ private:
const RuntimePointerChecking *RtPtrChecking) {
SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
- std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
- [&](const RuntimePointerChecking::PointerCheck &Check) {
- for (unsigned PtrIdx1 : Check.first->Members)
- for (unsigned PtrIdx2 : Check.second->Members)
- // Only include this check if there is a pair of pointers
- // that require checking and the pointers fall into
- // separate partitions.
- //
- // (Note that we already know at this point that the two
- // pointer groups need checking but it doesn't follow
- // that each pair of pointers within the two groups need
- // checking as well.
- //
- // In other words we don't want to include a check just
- // because there is a pair of pointers between the two
- // pointer groups that require checks and a different
- // pair whose pointers fall into different partitions.)
- if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
- !RuntimePointerChecking::arePointersInSamePartition(
- PtrToPartition, PtrIdx1, PtrIdx2))
- return true;
- return false;
- });
+ copy_if(AllChecks, std::back_inserter(Checks),
+ [&](const RuntimePointerChecking::PointerCheck &Check) {
+ for (unsigned PtrIdx1 : Check.first->Members)
+ for (unsigned PtrIdx2 : Check.second->Members)
+ // Only include this check if there is a pair of pointers
+ // that require checking and the pointers fall into
+ // separate partitions.
+ //
+ // (Note that we already know at this point that the two
+ // pointer groups need checking but it doesn't follow
+ // that each pair of pointers within the two groups need
+ // checking as well.
+ //
+ // In other words we don't want to include a check just
+ // because there is a pair of pointers between the two
+ // pointer groups that require checks and a different
+ // pair whose pointers fall into different partitions.)
+ if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
+ !RuntimePointerChecking::arePointersInSamePartition(
+ PtrToPartition, PtrIdx1, PtrIdx2))
+ return true;
+ return false;
+ });
return Checks;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 5fec51c095d0..946d85d7360f 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -236,9 +236,9 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) {
ApplyCodeSizeHeuristics =
L->getHeader()->getParent()->optForSize() && UseLIRCodeSizeHeurs;
- HasMemset = TLI->has(LibFunc::memset);
- HasMemsetPattern = TLI->has(LibFunc::memset_pattern16);
- HasMemcpy = TLI->has(LibFunc::memcpy);
+ HasMemset = TLI->has(LibFunc_memset);
+ HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
+ HasMemcpy = TLI->has(LibFunc_memcpy);
if (HasMemset || HasMemsetPattern || HasMemcpy)
if (SE->hasLoopInvariantBackedgeTakenCount(L))
@@ -823,7 +823,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
Module *M = TheStore->getModule();
Value *MSP =
M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(),
- Int8PtrTy, Int8PtrTy, IntPtr, (void *)nullptr);
+ Int8PtrTy, Int8PtrTy, IntPtr);
inferLibFuncAttributes(*M->getFunction("memset_pattern16"), *TLI);
// Otherwise we should form a memset_pattern16. PatternValue is known to be
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
index 69102d10ff60..28e71ca05436 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -189,7 +189,9 @@ PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
if (!SimplifyLoopInst(&L, &AR.DT, &AR.LI, &AR.AC, &AR.TLI))
return PreservedAnalyses::all();
- return getLoopPassPreservedAnalyses();
+ auto PA = getLoopPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
char LoopInstSimplifyLegacyPass::ID = 0;
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index e9f84edd1cbf..9f3875a3027f 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -39,7 +39,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
+
using namespace llvm;
#define DEBUG_TYPE "loop-interchange"
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
index 8fb580183e30..cf63cb660db8 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -20,13 +20,14 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/LoopLoadElimination.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -45,9 +46,9 @@
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
-#include <forward_list>
-#include <cassert>
#include <algorithm>
+#include <cassert>
+#include <forward_list>
#include <set>
#include <tuple>
#include <utility>
@@ -373,15 +374,15 @@ public:
const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks();
SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
- std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
- [&](const RuntimePointerChecking::PointerCheck &Check) {
- for (auto PtrIdx1 : Check.first->Members)
- for (auto PtrIdx2 : Check.second->Members)
- if (needsChecking(PtrIdx1, PtrIdx2,
- PtrsWrittenOnFwdingPath, CandLoadPtrs))
- return true;
- return false;
- });
+ copy_if(AllChecks, std::back_inserter(Checks),
+ [&](const RuntimePointerChecking::PointerCheck &Check) {
+ for (auto PtrIdx1 : Check.first->Members)
+ for (auto PtrIdx2 : Check.second->Members)
+ if (needsChecking(PtrIdx1, PtrIdx2, PtrsWrittenOnFwdingPath,
+ CandLoadPtrs))
+ return true;
+ return false;
+ });
DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size() << "):\n");
DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
@@ -558,6 +559,32 @@ private:
PredicatedScalarEvolution PSE;
};
+static bool
+eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT,
+ function_ref<const LoopAccessInfo &(Loop &)> GetLAI) {
+ // Build up a worklist of inner-loops to transform to avoid iterator
+ // invalidation.
+ // FIXME: This logic comes from other passes that actually change the loop
+ // nest structure. It isn't clear this is necessary (or useful) for a pass
+ // which merely optimizes the use of loads in a loop.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *TopLevelLoop : LI)
+ for (Loop *L : depth_first(TopLevelLoop))
+ // We only handle inner-most loops.
+ if (L->empty())
+ Worklist.push_back(L);
+
+ // Now walk the identified inner loops.
+ bool Changed = false;
+ for (Loop *L : Worklist) {
+ // The actual work is performed by LoadEliminationForLoop.
+ LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT);
+ Changed |= LEL.processLoop();
+ }
+ return Changed;
+}
+
/// \brief The pass. Most of the work is delegated to the per-loop
/// LoadEliminationForLoop class.
class LoopLoadElimination : public FunctionPass {
@@ -570,32 +597,14 @@ public:
if (skipFunction(F))
return false;
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-
- // Build up a worklist of inner-loops to vectorize. This is necessary as the
- // act of distributing a loop creates new loops and can invalidate iterators
- // across the loops.
- SmallVector<Loop *, 8> Worklist;
-
- for (Loop *TopLevelLoop : *LI)
- for (Loop *L : depth_first(TopLevelLoop))
- // We only handle inner-most loops.
- if (L->empty())
- Worklist.push_back(L);
-
- // Now walk the identified inner loops.
- bool Changed = false;
- for (Loop *L : Worklist) {
- const LoopAccessInfo &LAI = LAA->getInfo(L);
- // The actual work is performed by LoadEliminationForLoop.
- LoadEliminationForLoop LEL(L, LI, LAI, DT);
- Changed |= LEL.processLoop();
- }
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &LAA = getAnalysis<LoopAccessLegacyAnalysis>();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
// Process each loop nest in the function.
- return Changed;
+ return eliminateLoadsAcrossLoops(
+ F, LI, DT,
+ [&LAA](Loop &L) -> const LoopAccessInfo & { return LAA.getInfo(&L); });
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -631,4 +640,28 @@ FunctionPass *createLoopLoadEliminationPass() {
return new LoopLoadElimination();
}
+PreservedAnalyses LoopLoadEliminationPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+
+ auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
+ bool Changed = eliminateLoadsAcrossLoops(
+ F, LI, DT, [&](Loop &L) -> const LoopAccessInfo & {
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI};
+ return LAM.getResult<LoopAccessAnalysis>(L, AR);
+ });
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ return PA;
+}
+
} // end namespace llvm
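A hedged source-level example of the loop-carried store-to-load forwarding this pass performs (the kernel and names are invented): the load of A[i] reads the value stored as A[i] on the previous iteration, so it can be kept in a register instead of reloaded each time.

void kernel(int *A, const int *B, int n) {
  for (int i = 0; i < n - 1; ++i)
    A[i + 1] = A[i] + B[i];    // A[i] was stored as A[(i-1)+1] last iteration;
                               // only the first value is loaded before the loop
}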
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
index 028f4bba8b1d..10f6fcdcfdb7 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
@@ -42,6 +42,13 @@ PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &,
break;
}
+#ifndef NDEBUG
+ // Verify the loop structure and LCSSA form before visiting the loop.
+ L.verifyLoop();
+ assert(L.isRecursivelyLCSSAForm(AR.DT, AR.LI) &&
+ "Loops must remain in LCSSA form!");
+#endif
+
// Update the analysis manager as each pass runs and potentially
// invalidates analyses.
AM.invalidate(L, PassPA);
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp
new file mode 100644
index 000000000000..0ce604429326
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp
@@ -0,0 +1,282 @@
+//===-- LoopPredication.cpp - Guard based loop predication pass -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The LoopPredication pass tries to convert loop variant range checks to loop
+// invariant by widening checks across loop iterations. For example, it will
+// convert
+//
+// for (i = 0; i < n; i++) {
+// guard(i < len);
+// ...
+// }
+//
+// to
+//
+// for (i = 0; i < n; i++) {
+// guard(n - 1 < len);
+// ...
+// }
+//
+// After this transformation the condition of the guard is loop invariant, so
+// loop-unswitch can later unswitch the loop by this condition which basically
+// predicates the loop by the widened condition:
+//
+// if (n - 1 < len)
+// for (i = 0; i < n; i++) {
+// ...
+// }
+// else
+// deoptimize
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopPredication.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+#define DEBUG_TYPE "loop-predication"
+
+using namespace llvm;
+
+namespace {
+class LoopPredication {
+ ScalarEvolution *SE;
+
+ Loop *L;
+ const DataLayout *DL;
+ BasicBlock *Preheader;
+
+ Optional<Value *> widenICmpRangeCheck(ICmpInst *ICI, SCEVExpander &Expander,
+ IRBuilder<> &Builder);
+ bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander);
+
+public:
+ LoopPredication(ScalarEvolution *SE) : SE(SE){};
+ bool runOnLoop(Loop *L);
+};
+
+class LoopPredicationLegacyPass : public LoopPass {
+public:
+ static char ID;
+ LoopPredicationLegacyPass() : LoopPass(ID) {
+ initializeLoopPredicationLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ getLoopAnalysisUsage(AU);
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ LoopPredication LP(SE);
+ return LP.runOnLoop(L);
+ }
+};
+
+char LoopPredicationLegacyPass::ID = 0;
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(LoopPredicationLegacyPass, "loop-predication",
+ "Loop predication", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_END(LoopPredicationLegacyPass, "loop-predication",
+ "Loop predication", false, false)
+
+Pass *llvm::createLoopPredicationPass() {
+ return new LoopPredicationLegacyPass();
+}
+
+PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ LoopPredication LP(&AR.SE);
+ if (!LP.runOnLoop(&L))
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+/// If ICI can be widened to a loop invariant condition emits the loop
+/// invariant condition in the loop preheader and return it, otherwise
+/// returns None.
+Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
+ SCEVExpander &Expander,
+ IRBuilder<> &Builder) {
+ DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
+ DEBUG(ICI->dump());
+
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ Value *LHS = ICI->getOperand(0);
+ Value *RHS = ICI->getOperand(1);
+ const SCEV *LHSS = SE->getSCEV(LHS);
+ if (isa<SCEVCouldNotCompute>(LHSS))
+ return None;
+ const SCEV *RHSS = SE->getSCEV(RHS);
+ if (isa<SCEVCouldNotCompute>(RHSS))
+ return None;
+
+ // Canonicalize: RHS is the loop-invariant bound, LHS the loop-computable index.
+ if (SE->isLoopInvariant(LHSS, L)) {
+ std::swap(LHS, RHS);
+ std::swap(LHSS, RHSS);
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+ if (!SE->isLoopInvariant(RHSS, L) || !isSafeToExpand(RHSS, *SE))
+ return None;
+
+ const SCEVAddRecExpr *IndexAR = dyn_cast<SCEVAddRecExpr>(LHSS);
+ if (!IndexAR || IndexAR->getLoop() != L)
+ return None;
+
+ DEBUG(dbgs() << "IndexAR: ");
+ DEBUG(IndexAR->dump());
+
+ bool IsIncreasing = false;
+ if (!SE->isMonotonicPredicate(IndexAR, Pred, IsIncreasing))
+ return None;
+
+ // If the predicate is increasing the condition can change from false to true
+ // as the loop progresses, in this case take the value on the first iteration
+ // for the widened check. Otherwise the condition can change from true to
+ // false as the loop progresses, so take the value on the last iteration.
+ const SCEV *NewLHSS = IsIncreasing
+ ? IndexAR->getStart()
+ : SE->getSCEVAtScope(IndexAR, L->getParentLoop());
+ if (NewLHSS == IndexAR) {
+ DEBUG(dbgs() << "Can't compute NewLHSS!\n");
+ return None;
+ }
+
+ DEBUG(dbgs() << "NewLHSS: ");
+ DEBUG(NewLHSS->dump());
+
+ if (!SE->isLoopInvariant(NewLHSS, L) || !isSafeToExpand(NewLHSS, *SE))
+ return None;
+
+ DEBUG(dbgs() << "NewLHSS is loop invariant and safe to expand. Expand!\n");
+
+ Type *Ty = LHS->getType();
+ Instruction *InsertAt = Preheader->getTerminator();
+ assert(Ty == RHS->getType() && "icmp operands have different types?");
+ Value *NewLHS = Expander.expandCodeFor(NewLHSS, Ty, InsertAt);
+ Value *NewRHS = Expander.expandCodeFor(RHSS, Ty, InsertAt);
+ return Builder.CreateICmp(Pred, NewLHS, NewRHS);
+}
+
+bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
+ SCEVExpander &Expander) {
+ DEBUG(dbgs() << "Processing guard:\n");
+ DEBUG(Guard->dump());
+
+ IRBuilder<> Builder(cast<Instruction>(Preheader->getTerminator()));
+
+ // The guard condition is expected to be in form of:
+ // cond1 && cond2 && cond3 ...
+ // Iterate over subconditions looking for icmp conditions which can be
+ // widened across loop iterations. While widening these conditions, remember
+ // the resulting list of subconditions in the Checks vector.
+ SmallVector<Value *, 4> Worklist(1, Guard->getOperand(0));
+ SmallPtrSet<Value *, 4> Visited;
+
+ SmallVector<Value *, 4> Checks;
+
+ unsigned NumWidened = 0;
+ do {
+ Value *Condition = Worklist.pop_back_val();
+ if (!Visited.insert(Condition).second)
+ continue;
+
+ Value *LHS, *RHS;
+ using namespace llvm::PatternMatch;
+ if (match(Condition, m_And(m_Value(LHS), m_Value(RHS)))) {
+ Worklist.push_back(LHS);
+ Worklist.push_back(RHS);
+ continue;
+ }
+
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(Condition)) {
+ if (auto NewRangeCheck = widenICmpRangeCheck(ICI, Expander, Builder)) {
+ Checks.push_back(NewRangeCheck.getValue());
+ NumWidened++;
+ continue;
+ }
+ }
+
+ // Save the condition as is if we can't widen it
+ Checks.push_back(Condition);
+ } while (Worklist.size() != 0);
+
+ if (NumWidened == 0)
+ return false;
+
+ // Emit the new guard condition
+ Builder.SetInsertPoint(Guard);
+ Value *LastCheck = nullptr;
+ for (auto *Check : Checks)
+ if (!LastCheck)
+ LastCheck = Check;
+ else
+ LastCheck = Builder.CreateAnd(LastCheck, Check);
+ Guard->setOperand(0, LastCheck);
+
+ DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n");
+ return true;
+}
+
+bool LoopPredication::runOnLoop(Loop *Loop) {
+ L = Loop;
+
+ DEBUG(dbgs() << "Analyzing ");
+ DEBUG(L->dump());
+
+ Module *M = L->getHeader()->getModule();
+
+ // There is nothing to do if the module doesn't use guards
+ auto *GuardDecl =
+ M->getFunction(Intrinsic::getName(Intrinsic::experimental_guard));
+ if (!GuardDecl || GuardDecl->use_empty())
+ return false;
+
+ DL = &M->getDataLayout();
+
+ Preheader = L->getLoopPreheader();
+ if (!Preheader)
+ return false;
+
+ // Collect all the guards into a vector and process later, so as not
+ // to invalidate the instruction iterator.
+ SmallVector<IntrinsicInst *, 4> Guards;
+ for (const auto BB : L->blocks())
+ for (auto &I : *BB)
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::experimental_guard)
+ Guards.push_back(II);
+
+ SCEVExpander Expander(*SE, *DL, "loop-predication");
+
+ bool Changed = false;
+ for (auto *Guard : Guards)
+ Changed |= widenGuardConditions(Guard, Expander);
+
+ return Changed;
+}
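To make the widening concrete, a hedged source-level sketch of the transformation described in the file header (the guard intrinsic comes from managed-language frontends; the C++ below only mirrors the resulting control flow, and all names are invented):

void copyChecked(int *dst, const int *src, int n, int len) {
  // Before: every iteration executes guard(i < len).
  // After widening, one loop-invariant check covers all iterations and can be
  // unswitched out of the loop; if it fails, execution deoptimizes instead.
  if (n > 0 && !(n - 1 < len)) {
    return;                    // stand-in for the deoptimization path
  }
  for (int i = 0; i < n; ++i)
    dst[i] = src[i];           // the per-iteration range check is gone
}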
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
index cc83069d5f52..e5689368de80 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
@@ -79,7 +79,8 @@ private:
/// to merge the two values. Do this now.
static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
BasicBlock *OrigPreheader,
- ValueToValueMapTy &ValueMap) {
+ ValueToValueMapTy &ValueMap,
+ SmallVectorImpl<PHINode*> *InsertedPHIs) {
// Remove PHI node entries that are no longer live.
BasicBlock::iterator I, E = OrigHeader->end();
for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)
@@ -87,7 +88,7 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
// Now fix up users of the instructions in OrigHeader, inserting PHI nodes
// as necessary.
- SSAUpdater SSA;
+ SSAUpdater SSA(InsertedPHIs);
for (I = OrigHeader->begin(); I != E; ++I) {
Value *OrigHeaderVal = &*I;
@@ -174,6 +175,38 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
}
}
+/// Propagate dbg.value intrinsics through the newly inserted Phis.
+static void insertDebugValues(BasicBlock *OrigHeader,
+ SmallVectorImpl<PHINode*> &InsertedPHIs) {
+ ValueToValueMapTy DbgValueMap;
+
+ // Map existing PHI nodes to their dbg.values.
+ for (auto &I : *OrigHeader) {
+ if (auto DbgII = dyn_cast<DbgInfoIntrinsic>(&I)) {
+ if (auto *Loc = dyn_cast_or_null<PHINode>(DbgII->getVariableLocation()))
+ DbgValueMap.insert({Loc, DbgII});
+ }
+ }
+
+ // Then iterate through the new PHIs and look to see if they use one of the
+ // previously mapped PHIs. If so, insert a new dbg.value intrinsic that will
+ // propagate the info through the new PHI.
+ LLVMContext &C = OrigHeader->getContext();
+ for (auto PHI : InsertedPHIs) {
+ for (auto VI : PHI->operand_values()) {
+ auto V = DbgValueMap.find(VI);
+ if (V != DbgValueMap.end()) {
+ auto *DbgII = cast<DbgInfoIntrinsic>(V->second);
+ Instruction *NewDbgII = DbgII->clone();
+ auto PhiMAV = MetadataAsValue::get(C, ValueAsMetadata::get(PHI));
+ NewDbgII->setOperand(0, PhiMAV);
+ BasicBlock *Parent = PHI->getParent();
+ NewDbgII->insertBefore(Parent->getFirstNonPHIOrDbgOrLifetime());
+ }
+ }
+ }
+}
+
/// Rotate loop LP. Return true if the loop is rotated.
///
/// \param SimplifiedLatch is true if the latch was just folded into the final
@@ -347,9 +380,18 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// remove the corresponding incoming values from the PHI nodes in OrigHeader.
LoopEntryBranch->eraseFromParent();
+
+ SmallVector<PHINode*, 2> InsertedPHIs;
// If there were any uses of instructions in the duplicated block outside the
// loop, update them, inserting PHI nodes as required
- RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap);
+ RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap,
+ &InsertedPHIs);
+
+ // Attach dbg.value intrinsics to the new phis if the phi uses a value that
+ // previously had debug metadata attached. This keeps the debug info
+ // up-to-date in the loop body.
+ if (!InsertedPHIs.empty())
+ insertDebugValues(OrigHeader, InsertedPHIs);
// NewHeader is now the header of the loop.
L->moveToHeader(NewHeader);
@@ -634,6 +676,7 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
bool Changed = LR.processLoop(&L);
if (!Changed)
return PreservedAnalyses::all();
+
return getLoopPassPreservedAnalyses();
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index 16061212ba38..a5a81c33a8eb 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -69,6 +69,7 @@ PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM,
LPMUpdater &) {
if (!simplifyLoopCFG(L, AR.DT, AR.LI))
return PreservedAnalyses::all();
+
return getLoopPassPreservedAnalyses();
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp
index f3f415275c0e..c9d55b4594fe 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp
@@ -1,4 +1,4 @@
-//===-- LoopSink.cpp - Loop Sink Pass ------------------------===//
+//===-- LoopSink.cpp - Loop Sink Pass -------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -28,8 +28,10 @@
// InsertBBs = UseBBs - DomBBs + BB
// For BB in InsertBBs:
// Insert I at BB's beginning
+//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/LoopSink.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
@@ -297,6 +299,42 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
return Changed;
}
+PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) {
+ LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
+ // Nothing to do if there are no loops.
+ if (LI.empty())
+ return PreservedAnalyses::all();
+
+ AAResults &AA = FAM.getResult<AAManager>(F);
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+
+ // We want to do a postorder walk over the loops. Since loops are a tree this
+ // is equivalent to a reversed preorder walk and preorder is easy to compute
+ // without recursion. Since we reverse the preorder, we will visit siblings
+ // in reverse program order. This isn't expected to matter at all but is more
+ // consistent with sinking algorithms which generally work bottom-up.
+ SmallVector<Loop *, 4> PreorderLoops = LI.getLoopsInPreorder();
+
+ bool Changed = false;
+ do {
+ Loop &L = *PreorderLoops.pop_back_val();
+
+ // Note that we don't pass SCEV here because it is only used to invalidate
+ // loops in SCEV and we don't preserve (or request) SCEV at all making that
+ // unnecessary.
+ Changed |= sinkLoopInvariantInstructions(L, AA, LI, DT, BFI,
+ /*ScalarEvolution*/ nullptr);
+ } while (!PreorderLoops.empty());
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
namespace {
struct LegacyLoopSinkPass : public LoopPass {
static char ID;
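
The traversal comment in LoopSinkPass::run above relies on a property of preorder loop lists; a minimal sketch under that assumption, with processLoop as a hypothetical stand-in for the per-loop work:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"

// Preorder lists each parent before its children, so popping the worklist from
// the back visits every child before its parent -- a post-order, which is what
// the bottom-up sinking wants.
static void walkLoopsPostOrder(llvm::LoopInfo &LI,
                               llvm::function_ref<void(llvm::Loop &)> processLoop) {
  llvm::SmallVector<llvm::Loop *, 4> Worklist = LI.getLoopsInPreorder();
  while (!Worklist.empty())
    processLoop(*Worklist.pop_back_val());
}
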
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 194587a85e7c..af137f6faa63 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -129,6 +129,17 @@ static cl::opt<bool> EnablePhiElim(
"enable-lsr-phielim", cl::Hidden, cl::init(true),
cl::desc("Enable LSR phi elimination"));
+// This flag adds the instruction count to the LSR solution cost comparison.
+static cl::opt<bool> InsnsCost(
+ "lsr-insns-cost", cl::Hidden, cl::init(false),
+ cl::desc("Add instruction count to a LSR cost model"));
+
+// Flag to choose how to narrow a complex LSR solution.
+static cl::opt<bool> LSRExpNarrow(
+ "lsr-exp-narrow", cl::Hidden, cl::init(false),
+ cl::desc("Narrow LSR complex solution using"
+ " expectation of registers number"));
+
#ifndef NDEBUG
// Stress test IV chain generation.
static cl::opt<bool> StressIVChain(
@@ -181,10 +192,11 @@ void RegSortData::print(raw_ostream &OS) const {
OS << "[NumUses=" << UsedByIndices.count() << ']';
}
-LLVM_DUMP_METHOD
-void RegSortData::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void RegSortData::dump() const {
print(errs()); errs() << '\n';
}
+#endif
namespace {
@@ -295,9 +307,13 @@ struct Formula {
/// canonical representation of a formula is
/// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
/// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
+ /// 3. The reg containing the recurrent expr related to the current loop in
+ /// the formula should be put in the ScaledReg.
/// #1 enforces that the scaled register is always used when at least two
/// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
/// #2 enforces that 1 * reg is reg.
+ /// #3 ensures invariant regs with respect to current loop can be combined
+ /// together in LSR codegen.
/// This invariant can be temporarily broken while building a formula.
/// However, every formula inserted into the LSRInstance must be in canonical
/// form.
@@ -318,12 +334,14 @@ struct Formula {
void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
- bool isCanonical() const;
+ bool isCanonical(const Loop &L) const;
- void canonicalize();
+ void canonicalize(const Loop &L);
bool unscale();
+ bool hasZeroEnd() const;
+
size_t getNumRegs() const;
Type *getType() const;
@@ -410,16 +428,35 @@ void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
BaseRegs.push_back(Sum);
HasBaseReg = true;
}
- canonicalize();
+ canonicalize(*L);
}
/// \brief Check whether or not this formula satisfies the canonical
/// representation.
/// \see Formula::BaseRegs.
-bool Formula::isCanonical() const {
- if (ScaledReg)
- return Scale != 1 || !BaseRegs.empty();
- return BaseRegs.size() <= 1;
+bool Formula::isCanonical(const Loop &L) const {
+ if (!ScaledReg)
+ return BaseRegs.size() <= 1;
+
+ if (Scale != 1)
+ return true;
+
+ if (Scale == 1 && BaseRegs.empty())
+ return false;
+
+ const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
+ if (SAR && SAR->getLoop() == &L)
+ return true;
+
+ // If ScaledReg is not a recurrent expr, or it is one but its loop is not the
+ // current loop, while BaseRegs contains a recurrent expr reg related to the
+ // current loop, we want to swap the reg in BaseRegs with ScaledReg.
+ auto I =
+ find_if(make_range(BaseRegs.begin(), BaseRegs.end()), [&](const SCEV *S) {
+ return isa<const SCEVAddRecExpr>(S) &&
+ (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
+ });
+ return I == BaseRegs.end();
}
/// \brief Helper method to morph a formula into its canonical representation.
@@ -428,21 +465,33 @@ bool Formula::isCanonical() const {
/// field. Otherwise, we would have to do special cases everywhere in LSR
/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
/// On the other hand, 1*reg should be canonicalized into reg.
-void Formula::canonicalize() {
- if (isCanonical())
+void Formula::canonicalize(const Loop &L) {
+ if (isCanonical(L))
return;
// So far we did not need this case. This is easy to implement but it is
// useless to maintain dead code. Besides, it could hurt compile time.
assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
+
// Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
- ScaledReg = BaseRegs.back();
- BaseRegs.pop_back();
- Scale = 1;
- size_t BaseRegsSize = BaseRegs.size();
- size_t Try = 0;
- // If ScaledReg is an invariant, try to find a variant expression.
- while (Try < BaseRegsSize && !isa<SCEVAddRecExpr>(ScaledReg))
- std::swap(ScaledReg, BaseRegs[Try++]);
+ if (!ScaledReg) {
+ ScaledReg = BaseRegs.back();
+ BaseRegs.pop_back();
+ Scale = 1;
+ }
+
+ // If ScaledReg is an invariant with respect to L, find the reg from
+ // BaseRegs containing the recurrent expr related to Loop L. Swap the
+ // reg with ScaledReg.
+ const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
+ if (!SAR || SAR->getLoop() != &L) {
+ auto I = find_if(make_range(BaseRegs.begin(), BaseRegs.end()),
+ [&](const SCEV *S) {
+ return isa<const SCEVAddRecExpr>(S) &&
+ (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
+ });
+ if (I != BaseRegs.end())
+ std::swap(ScaledReg, *I);
+ }
}
/// \brief Get rid of the scale in the formula.
@@ -458,6 +507,14 @@ bool Formula::unscale() {
return true;
}
+bool Formula::hasZeroEnd() const {
+ if (UnfoldedOffset || BaseOffset)
+ return false;
+ if (BaseRegs.size() != 1 || ScaledReg)
+ return false;
+ return true;
+}
+
/// Return the total number of register operands used by this formula. This does
/// not include register uses implied by non-constant addrec strides.
size_t Formula::getNumRegs() const {
@@ -534,10 +591,11 @@ void Formula::print(raw_ostream &OS) const {
}
}
-LLVM_DUMP_METHOD
-void Formula::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void Formula::dump() const {
print(errs()); errs() << '\n';
}
+#endif
/// Return true if the given addrec can be sign-extended without changing its
/// value.
@@ -711,7 +769,7 @@ static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
bool isAddress = isa<LoadInst>(Inst);
if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- if (SI->getOperand(1) == OperandVal)
+ if (SI->getPointerOperand() == OperandVal)
isAddress = true;
} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
// Addressing modes can also be folded into prefetches and a variety
@@ -723,6 +781,12 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
isAddress = true;
break;
}
+ } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
+ if (RMW->getPointerOperand() == OperandVal)
+ isAddress = true;
+ } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ if (CmpX->getPointerOperand() == OperandVal)
+ isAddress = true;
}
return isAddress;
}
@@ -735,6 +799,10 @@ static MemAccessTy getAccessType(const Instruction *Inst) {
AccessTy.AddrSpace = SI->getPointerAddressSpace();
} else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
AccessTy.AddrSpace = LI->getPointerAddressSpace();
+ } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
+ AccessTy.AddrSpace = RMW->getPointerAddressSpace();
+ } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
}
// All pointers have the same requirements, so canonicalize them to an
@@ -875,7 +943,8 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
const LSRUse &LU, const Formula &F);
// Get the cost of the scaling factor used in F for LU.
static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
- const LSRUse &LU, const Formula &F);
+ const LSRUse &LU, const Formula &F,
+ const Loop &L);
namespace {
@@ -883,6 +952,7 @@ namespace {
class Cost {
/// TODO: Some of these could be merged. Also, a lexical ordering
/// isn't always optimal.
+ unsigned Insns;
unsigned NumRegs;
unsigned AddRecCost;
unsigned NumIVMuls;
@@ -893,8 +963,8 @@ class Cost {
public:
Cost()
- : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
- SetupCost(0), ScaleCost(0) {}
+ : Insns(0), NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0),
+ ImmCost(0), SetupCost(0), ScaleCost(0) {}
bool operator<(const Cost &Other) const;
@@ -903,9 +973,9 @@ public:
#ifndef NDEBUG
// Once any of the metrics loses, they must all remain losers.
bool isValid() {
- return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
+ return ((Insns | NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
| ImmCost | SetupCost | ScaleCost) != ~0u)
- || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
+ || ((Insns & NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
& ImmCost & SetupCost & ScaleCost) == ~0u);
}
#endif
@@ -1067,7 +1137,8 @@ public:
}
bool HasFormulaWithSameRegs(const Formula &F) const;
- bool InsertFormula(const Formula &F);
+ float getNotSelectedProbability(const SCEV *Reg) const;
+ bool InsertFormula(const Formula &F, const Loop &L);
void DeleteFormula(Formula &F);
void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
@@ -1083,17 +1154,23 @@ void Cost::RateRegister(const SCEV *Reg,
const Loop *L,
ScalarEvolution &SE, DominatorTree &DT) {
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
- // If this is an addrec for another loop, don't second-guess its addrec phi
- // nodes. LSR isn't currently smart enough to reason about more than one
- // loop at a time. LSR has already run on inner loops, will not run on outer
- // loops, and cannot be expected to change sibling loops.
+ // If this is an addrec for another loop, it should be an invariant
+ // with respect to L since L is the innermost loop (at least
+ // for now LSR only handles innermost loops).
if (AR->getLoop() != L) {
// If the AddRec exists, consider its register free and leave it alone.
if (isExistingPhi(AR, SE))
return;
- // Otherwise, do not consider this formula at all.
- Lose();
+ // It is bad to allow LSR for current loop to add induction variables
+ // for its sibling loops.
+ if (!AR->getLoop()->contains(L)) {
+ Lose();
+ return;
+ }
+
+ // Otherwise, it will be an invariant with respect to Loop L.
+ ++NumRegs;
return;
}
AddRecCost += 1; /// TODO: This should be a function of the stride.
@@ -1150,8 +1227,11 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
ScalarEvolution &SE, DominatorTree &DT,
const LSRUse &LU,
SmallPtrSetImpl<const SCEV *> *LoserRegs) {
- assert(F.isCanonical() && "Cost is accurate only for canonical formula");
+ assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
// Tally up the registers.
+ unsigned PrevAddRecCost = AddRecCost;
+ unsigned PrevNumRegs = NumRegs;
+ unsigned PrevNumBaseAdds = NumBaseAdds;
if (const SCEV *ScaledReg = F.ScaledReg) {
if (VisitedRegs.count(ScaledReg)) {
Lose();
@@ -1171,6 +1251,18 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
return;
}
+ // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
+ // an additional instruction (at least a fill).
+ unsigned TTIRegNum = TTI.getNumberOfRegisters(false) - 1;
+ if (NumRegs > TTIRegNum) {
+ // If the cost already exceeded TTIRegNum, then only the newly added
+ // registers can add new instructions.
+ if (PrevNumRegs > TTIRegNum)
+ Insns += (NumRegs - PrevNumRegs);
+ else
+ Insns += (NumRegs - TTIRegNum);
+ }
+
// Determine how many (unfolded) adds we'll need inside the loop.
size_t NumBaseParts = F.getNumRegs();
if (NumBaseParts > 1)
@@ -1181,7 +1273,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
NumBaseAdds += (F.UnfoldedOffset != 0);
// Accumulate non-free scaling amounts.
- ScaleCost += getScalingFactorCost(TTI, LU, F);
+ ScaleCost += getScalingFactorCost(TTI, LU, F, *L);
// Tally up the non-zero immediates.
for (const LSRFixup &Fixup : LU.Fixups) {
@@ -1199,11 +1291,30 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
!TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset))
NumBaseAdds++;
}
+
+ // If an ICmpZero formula does not end with 0, it cannot be replaced by just
+ // an add or sub. We'll need to compare the final result of the AddRec, which
+ // means we'll need an additional instruction.
+ // For -10 + {0, +, 1}:
+ // i = i + 1;
+ // cmp i, 10
+ //
+ // For {-10, +, 1}:
+ // i = i + 1;
+ if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd())
+ Insns++;
+ // Each new AddRec adds 1 instruction to calculation.
+ Insns += (AddRecCost - PrevAddRecCost);
+
+ // BaseAdds adds instructions for unfolded registers.
+ if (LU.Kind != LSRUse::ICmpZero)
+ Insns += NumBaseAdds - PrevNumBaseAdds;
assert(isValid() && "invalid cost");
}
/// Set this cost to a losing value.
void Cost::Lose() {
+ Insns = ~0u;
NumRegs = ~0u;
AddRecCost = ~0u;
NumIVMuls = ~0u;
@@ -1215,6 +1326,8 @@ void Cost::Lose() {
/// Choose the lower cost.
bool Cost::operator<(const Cost &Other) const {
+ if (InsnsCost && Insns != Other.Insns)
+ return Insns < Other.Insns;
return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
ImmCost, SetupCost) <
std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
@@ -1223,6 +1336,7 @@ bool Cost::operator<(const Cost &Other) const {
}
void Cost::print(raw_ostream &OS) const {
+ OS << Insns << " instruction" << (Insns == 1 ? " " : "s ");
OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
if (AddRecCost != 0)
OS << ", with addrec cost " << AddRecCost;
@@ -1239,10 +1353,11 @@ void Cost::print(raw_ostream &OS) const {
OS << ", plus " << SetupCost << " setup cost";
}
-LLVM_DUMP_METHOD
-void Cost::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void Cost::dump() const {
print(errs()); errs() << '\n';
}
+#endif
LSRFixup::LSRFixup()
: UserInst(nullptr), OperandValToReplace(nullptr),
@@ -1285,10 +1400,11 @@ void LSRFixup::print(raw_ostream &OS) const {
OS << ", Offset=" << Offset;
}
-LLVM_DUMP_METHOD
-void LSRFixup::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LSRFixup::dump() const {
print(errs()); errs() << '\n';
}
+#endif
/// Test whether this use as a formula which has the same registers as the given
/// formula.
@@ -1300,10 +1416,19 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
return Uniquifier.count(Key);
}
+/// Return the probability of selecting a formula that does not reference Reg.
+float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
+ unsigned FNum = 0;
+ for (const Formula &F : Formulae)
+ if (F.referencesReg(Reg))
+ FNum++;
+ return ((float)(Formulae.size() - FNum)) / Formulae.size();
+}
+
/// If the given formula has not yet been inserted, add it to the list, and
/// return true. Return false otherwise. The formula must be in canonical form.
-bool LSRUse::InsertFormula(const Formula &F) {
- assert(F.isCanonical() && "Invalid canonical representation");
+bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
+ assert(F.isCanonical(L) && "Invalid canonical representation");
if (!Formulae.empty() && RigidFormula)
return false;
@@ -1391,10 +1516,11 @@ void LSRUse::print(raw_ostream &OS) const {
OS << ", widest fixup type: " << *WidestFixupType;
}
-LLVM_DUMP_METHOD
-void LSRUse::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LSRUse::dump() const {
print(errs()); errs() << '\n';
}
+#endif
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
LSRUse::KindType Kind, MemAccessTy AccessTy,
@@ -1472,7 +1598,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
int64_t MinOffset, int64_t MaxOffset,
LSRUse::KindType Kind, MemAccessTy AccessTy,
- const Formula &F) {
+ const Formula &F, const Loop &L) {
// For the purpose of isAMCompletelyFolded either having a canonical formula
// or a scale not equal to zero is correct.
// Problems may arise from non canonical formulae having a scale == 0.
@@ -1480,7 +1606,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
// However, when we generate the scaled formulae, we first check that the
// scaling factor is profitable before computing the actual ScaledReg for
// compile time sake.
- assert((F.isCanonical() || F.Scale != 0));
+ assert((F.isCanonical(L) || F.Scale != 0));
return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
}
@@ -1515,14 +1641,15 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
}
static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
- const LSRUse &LU, const Formula &F) {
+ const LSRUse &LU, const Formula &F,
+ const Loop &L) {
if (!F.Scale)
return 0;
// If the use is not completely folded in that instruction, we will have to
// pay an extra cost only for scale != 1.
if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
- LU.AccessTy, F))
+ LU.AccessTy, F, L))
return F.Scale != 1;
switch (LU.Kind) {
@@ -1772,6 +1899,7 @@ class LSRInstance {
void NarrowSearchSpaceByDetectingSupersets();
void NarrowSearchSpaceByCollapsingUnrolledCode();
void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+ void NarrowSearchSpaceByDeletingCostlyFormulas();
void NarrowSearchSpaceByPickingWinnerRegs();
void NarrowSearchSpaceUsingHeuristics();
@@ -2492,7 +2620,12 @@ static Value *getWideOperand(Value *Oper) {
static bool isCompatibleIVType(Value *LVal, Value *RVal) {
Type *LType = LVal->getType();
Type *RType = RVal->getType();
- return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy());
+ return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy() &&
+ // Different address spaces mean (possibly)
+ // different pointer implementation types,
+ // e.g. i16 vs i32, so disallow that.
+ (LType->getPointerAddressSpace() ==
+ RType->getPointerAddressSpace()));
}
/// Return an approximation of this SCEV expression's "base", or NULL for any
@@ -2989,8 +3122,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
User::op_iterator UseI =
find(UserInst->operands(), U.getOperandValToReplace());
assert(UseI != UserInst->op_end() && "cannot find IV operand");
- if (IVIncSet.count(UseI))
+ if (IVIncSet.count(UseI)) {
+ DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
continue;
+ }
LSRUse::KindType Kind = LSRUse::Basic;
MemAccessTy AccessTy;
@@ -3025,8 +3160,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) {
// S is normalized, so normalize N before folding it into S
// to keep the result normalized.
- N = TransformForPostIncUse(Normalize, N, CI, nullptr,
- TmpPostIncLoops, SE, DT);
+ N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
Kind = LSRUse::ICmpZero;
S = SE.getMinusSCEV(N, S);
}
@@ -3108,7 +3242,8 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
// Do not insert formula that we will not be able to expand.
assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
"Formula is illegal");
- if (!LU.InsertFormula(F))
+
+ if (!LU.InsertFormula(F, *L))
return false;
CountRegisters(F, LUIdx);
@@ -3347,7 +3482,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
F.BaseRegs.push_back(*J);
// We may have changed the number of register in base regs, adjust the
// formula accordingly.
- F.canonicalize();
+ F.canonicalize(*L);
if (InsertFormula(LU, LUIdx, F))
// If that formula hadn't been seen before, recurse to find more like
@@ -3359,7 +3494,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
/// Split out subexpressions from adds and the bases of addrecs.
void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
Formula Base, unsigned Depth) {
- assert(Base.isCanonical() && "Input must be in the canonical form");
+ assert(Base.isCanonical(*L) && "Input must be in the canonical form");
// Arbitrarily cap recursion to protect compile time.
if (Depth >= 3)
return;
@@ -3400,7 +3535,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
// rather than proceed with zero in a register.
if (!Sum->isZero()) {
F.BaseRegs.push_back(Sum);
- F.canonicalize();
+ F.canonicalize(*L);
(void)InsertFormula(LU, LUIdx, F);
}
}
@@ -3457,7 +3592,7 @@ void LSRInstance::GenerateConstantOffsetsImpl(
F.ScaledReg = nullptr;
} else
F.deleteBaseReg(F.BaseRegs[Idx]);
- F.canonicalize();
+ F.canonicalize(*L);
} else if (IsScaledReg)
F.ScaledReg = NewG;
else
@@ -3620,10 +3755,10 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
if (LU.Kind == LSRUse::ICmpZero &&
!Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
continue;
- // For each addrec base reg, apply the scale, if possible.
- for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
- if (const SCEVAddRecExpr *AR =
- dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
+ // For each addrec base reg, if its loop is current loop, apply the scale.
+ for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
+ if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
const SCEV *FactorS = SE.getConstant(IntTy, Factor);
if (FactorS->isZero())
continue;
@@ -3637,11 +3772,17 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
// The canonical representation of 1*reg is reg, which is already in
// Base. In that case, do not try to insert the formula, it will be
// rejected anyway.
- if (F.Scale == 1 && F.BaseRegs.empty())
+ if (F.Scale == 1 && (F.BaseRegs.empty() ||
+ (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
continue;
+ // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate a
+ // non-canonical Formula whose ScaledReg's loop is not L.
+ if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
+ F.canonicalize(*L);
(void)InsertFormula(LU, LUIdx, F);
}
}
+ }
}
}
@@ -3697,10 +3838,11 @@ void WorkItem::print(raw_ostream &OS) const {
<< " , add offset " << Imm;
}
-LLVM_DUMP_METHOD
-void WorkItem::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void WorkItem::dump() const {
print(errs()); errs() << '\n';
}
+#endif
/// Look for registers which are a constant distance apart and try to form reuse
/// opportunities between them.
@@ -3821,7 +3963,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
continue;
// OK, looks good.
- NewF.canonicalize();
+ NewF.canonicalize(*this->L);
(void)InsertFormula(LU, LUIdx, NewF);
} else {
// Use the immediate in a base register.
@@ -3853,7 +3995,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
goto skip_formula;
// Ok, looks good.
- NewF.canonicalize();
+ NewF.canonicalize(*this->L);
(void)InsertFormula(LU, LUIdx, NewF);
break;
skip_formula:;
@@ -4165,6 +4307,144 @@ void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
}
}
+/// This function deletes formulas with a high expected register count.
+/// Assuming we don't know the value of each formula (all inefficient ones
+/// have already been deleted), compute for each register the probability of
+/// it not being selected.
+/// For example,
+/// Use1:
+/// reg(a) + reg({0,+,1})
+/// reg(a) + reg({-1,+,1}) + 1
+/// reg({a,+,1})
+/// Use2:
+/// reg(b) + reg({0,+,1})
+/// reg(b) + reg({-1,+,1}) + 1
+/// reg({b,+,1})
+/// Use3:
+/// reg(c) + reg(b) + reg({0,+,1})
+/// reg(c) + reg({b,+,1})
+///
+/// Probability of not selecting
+/// Use1 Use2 Use3
+/// reg(a) (1/3) * 1 * 1
+/// reg(b) 1 * (1/3) * (1/2)
+/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
+/// reg({-1,+,1}) (2/3) * (2/3) * 1
+/// reg({a,+,1}) (2/3) * 1 * 1
+/// reg({b,+,1}) 1 * (2/3) * (2/3)
+/// reg(c) 1 * 1 * 0
+///
+/// Now compute the mathematical expectation of the register count for each
+/// formula. Note that for each use we exclude the probability of not being
+/// selected for that use. For example, for Use1 the probability for reg(a)
+/// would be just 1 * 1 (excluding the probability 1/3 of not being selected
+/// for Use1).
+/// Use1:
+/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
+/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
+/// reg({a,+,1}) 1
+/// Use2:
+/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
+/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
+/// reg({b,+,1}) 2/3
+/// Use3:
+/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
+/// reg(c) + reg({b,+,1}) 1 + 2/3
+
+void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ return;
+ // Ok, we have too many formulae on our hands to conveniently handle.
+ // Use a rough heuristic to thin out the list.
+
+ // Set of Regs which will be 100% used in the final solution, i.e. used in
+ // each formula of a solution (in the example above this is reg(c)).
+ // We can skip them in calculations.
+ SmallPtrSet<const SCEV *, 4> UniqRegs;
+ DEBUG(dbgs() << "The search space is too complex.\n");
+
+ // Map each register to its probability of not being selected.
+ DenseMap <const SCEV *, float> RegNumMap;
+ for (const SCEV *Reg : RegUses) {
+ if (UniqRegs.count(Reg))
+ continue;
+ float PNotSel = 1;
+ for (const LSRUse &LU : Uses) {
+ if (!LU.Regs.count(Reg))
+ continue;
+ float P = LU.getNotSelectedProbability(Reg);
+ if (P != 0.0)
+ PNotSel *= P;
+ else
+ UniqRegs.insert(Reg);
+ }
+ RegNumMap.insert(std::make_pair(Reg, PNotSel));
+ }
+
+ DEBUG(dbgs() << "Narrowing the search space by deleting costly formulas\n");
+
+ // Delete formulas whose expected register count is high.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ // If nothing to delete - continue.
+ if (LU.Formulae.size() < 2)
+ continue;
+ // This is a temporary solution used to test performance. Float should be
+ // replaced with a rounding-independent type (based on integers) to avoid
+ // different results for different target builds.
+ float FMinRegNum = LU.Formulae[0].getNumRegs();
+ float FMinARegNum = LU.Formulae[0].getNumRegs();
+ size_t MinIdx = 0;
+ for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+ Formula &F = LU.Formulae[i];
+ float FRegNum = 0;
+ float FARegNum = 0;
+ for (const SCEV *BaseReg : F.BaseRegs) {
+ if (UniqRegs.count(BaseReg))
+ continue;
+ FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
+ if (isa<SCEVAddRecExpr>(BaseReg))
+ FARegNum +=
+ RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
+ }
+ if (const SCEV *ScaledReg = F.ScaledReg) {
+ if (!UniqRegs.count(ScaledReg)) {
+ FRegNum +=
+ RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
+ if (isa<SCEVAddRecExpr>(ScaledReg))
+ FARegNum +=
+ RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
+ }
+ }
+ if (FMinRegNum > FRegNum ||
+ (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
+ FMinRegNum = FRegNum;
+ FMinARegNum = FARegNum;
+ MinIdx = i;
+ }
+ }
+ DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
+ dbgs() << " with min reg num " << FMinRegNum << '\n');
+ if (MinIdx != 0)
+ std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
+ while (LU.Formulae.size() != 1) {
+ DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
+ dbgs() << '\n');
+ LU.Formulae.pop_back();
+ }
+ LU.RecomputeRegs(LUIdx, RegUses);
+ assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
+ Formula &F = LU.Formulae[0];
+ DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
+ // When we choose the formula, the regs become unique.
+ UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
+ if (F.ScaledReg)
+ UniqRegs.insert(F.ScaledReg);
+ }
+ DEBUG(dbgs() << "After pre-selection:\n";
+ print_uses(dbgs()));
+}
+
+
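
To make the worked example in the comment above concrete, a small numeric sketch of the per-formula expectation that NarrowSearchSpaceByDeletingCostlyFormulas computes; the values are taken from that example and the variable names are illustrative only:

#include <cassert>
#include <cmath>

int main() {
  // Expected register count of Use1's formula "reg(a) + reg({0,+,1})".
  // RegNumMap holds each register's overall probability of not being selected;
  // dividing by the current use's own factor excludes that use, as described.
  float PNotSelA  = 1.0f / 3.0f;                // (1/3) * 1 * 1
  float PNotSelIV = 2.0f / 9.0f;                // (2/3) * (2/3) * (1/2)
  float Expected  = PNotSelA / (1.0f / 3.0f)    // reg(a) contributes 1
                  + PNotSelIV / (2.0f / 3.0f);  // reg({0,+,1}) contributes 1/3
  assert(std::fabs(Expected - 4.0f / 3.0f) < 1e-6f); // 1 + 1/3, as in the table
  return 0;
}
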
/// Pick a register which seems likely to be profitable, and then in any use
/// which has any reference to that register, delete all formulae which do not
/// reference that register.
@@ -4237,7 +4517,10 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
NarrowSearchSpaceByDetectingSupersets();
NarrowSearchSpaceByCollapsingUnrolledCode();
NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
- NarrowSearchSpaceByPickingWinnerRegs();
+ if (LSRExpNarrow)
+ NarrowSearchSpaceByDeletingCostlyFormulas();
+ else
+ NarrowSearchSpaceByPickingWinnerRegs();
}
/// This is the recursive solver.
@@ -4515,11 +4798,7 @@ Value *LSRInstance::Expand(const LSRUse &LU,
assert(!Reg->isZero() && "Zero allocated in a base register!");
// If we're expanding for a post-inc user, make the post-inc adjustment.
- PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
- Reg = TransformForPostIncUse(Denormalize, Reg,
- LF.UserInst, LF.OperandValToReplace,
- Loops, SE, DT);
-
+ Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
}
@@ -4530,9 +4809,7 @@ Value *LSRInstance::Expand(const LSRUse &LU,
// If we're expanding for a post-inc user, make the post-inc adjustment.
PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
- ScaledS = TransformForPostIncUse(Denormalize, ScaledS,
- LF.UserInst, LF.OperandValToReplace,
- Loops, SE, DT);
+ ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
if (LU.Kind == LSRUse::ICmpZero) {
// Expand ScaleReg as if it was part of the base regs.
@@ -4975,10 +5252,11 @@ void LSRInstance::print(raw_ostream &OS) const {
print_uses(OS);
}
-LLVM_DUMP_METHOD
-void LSRInstance::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LSRInstance::dump() const {
print(errs()); errs() << '\n';
}
+#endif
namespace {
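
As a numeric illustration of the register-pressure term added to Cost::RateFormula above, a standalone sketch that mirrors that logic with made-up register counts (the real code takes the budget from TTI.getNumberOfRegisters(false) - 1):

#include <cassert>

// Once the live register count exceeds the budget, each extra register is
// charged as one additional instruction (an estimate of spill/fill code).
static unsigned extraInsns(unsigned PrevNumRegs, unsigned NumRegs,
                           unsigned TTIRegNum) {
  if (NumRegs <= TTIRegNum)
    return 0;
  return PrevNumRegs > TTIRegNum ? NumRegs - PrevNumRegs : NumRegs - TTIRegNum;
}

int main() {
  assert(extraInsns(30, 33, 31) == 2); // first crossing of the budget: 33 - 31
  assert(extraInsns(33, 35, 31) == 2); // already over budget: only 35 - 33
  return 0;
}
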
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index c7f91226d222..62aa6ee48069 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -44,7 +44,11 @@ using namespace llvm;
static cl::opt<unsigned>
UnrollThreshold("unroll-threshold", cl::Hidden,
- cl::desc("The baseline cost threshold for loop unrolling"));
+ cl::desc("The cost threshold for loop unrolling"));
+
+static cl::opt<unsigned> UnrollPartialThreshold(
+ "unroll-partial-threshold", cl::Hidden,
+ cl::desc("The cost threshold for partial loop unrolling"));
static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
"unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
@@ -106,10 +110,19 @@ static cl::opt<unsigned> FlatLoopTripCountThreshold(
"aggressively unrolled."));
static cl::opt<bool>
- UnrollAllowPeeling("unroll-allow-peeling", cl::Hidden,
+ UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden,
cl::desc("Allows loops to be peeled when the dynamic "
"trip count is known to be low."));
+// This option isn't ever intended to be enabled; it serves to allow
+// experiments to check the assumptions about when this kind of revisit is
+// necessary.
+static cl::opt<bool> UnrollRevisitChildLoops(
+ "unroll-revisit-child-loops", cl::Hidden,
+ cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
+ "This shouldn't typically be needed as child loops (or their "
+ "clones) were already visited."));
+
/// A magic value for use with the Threshold parameter to indicate
/// that the loop unroll should be performed regardless of how much
/// code expansion would result.
@@ -118,16 +131,17 @@ static const unsigned NoThreshold = UINT_MAX;
/// Gather the various unrolling parameters based on the defaults, compiler
/// flags, TTI overrides and user specified parameters.
static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
- Loop *L, const TargetTransformInfo &TTI, Optional<unsigned> UserThreshold,
- Optional<unsigned> UserCount, Optional<bool> UserAllowPartial,
- Optional<bool> UserRuntime, Optional<bool> UserUpperBound) {
+ Loop *L, const TargetTransformInfo &TTI, int OptLevel,
+ Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
+ Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
+ Optional<bool> UserUpperBound) {
TargetTransformInfo::UnrollingPreferences UP;
// Set up the defaults
- UP.Threshold = 150;
+ UP.Threshold = OptLevel > 2 ? 300 : 150;
UP.MaxPercentThresholdBoost = 400;
UP.OptSizeThreshold = 0;
- UP.PartialThreshold = UP.Threshold;
+ UP.PartialThreshold = 150;
UP.PartialOptSizeThreshold = 0;
UP.Count = 0;
UP.PeelCount = 0;
@@ -141,7 +155,7 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
UP.AllowExpensiveTripCount = false;
UP.Force = false;
UP.UpperBound = false;
- UP.AllowPeeling = false;
+ UP.AllowPeeling = true;
// Override with any target specific settings
TTI.getUnrollingPreferences(L, UP);
@@ -153,10 +167,10 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
}
// Apply any user values specified by cl::opt
- if (UnrollThreshold.getNumOccurrences() > 0) {
+ if (UnrollThreshold.getNumOccurrences() > 0)
UP.Threshold = UnrollThreshold;
- UP.PartialThreshold = UnrollThreshold;
- }
+ if (UnrollPartialThreshold.getNumOccurrences() > 0)
+ UP.PartialThreshold = UnrollPartialThreshold;
if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
if (UnrollMaxCount.getNumOccurrences() > 0)
@@ -495,7 +509,7 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
KnownSucc = SI->getSuccessor(0);
else if (ConstantInt *SimpleCondVal =
dyn_cast<ConstantInt>(SimpleCond))
- KnownSucc = SI->findCaseValue(SimpleCondVal).getCaseSuccessor();
+ KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
}
}
if (KnownSucc) {
@@ -770,7 +784,15 @@ static bool computeUnrollCount(
}
}
- // 4rd priority is partial unrolling.
+ // 4th priority is loop peeling
+ computePeelCount(L, LoopSize, UP, TripCount);
+ if (UP.PeelCount) {
+ UP.Runtime = false;
+ UP.Count = 1;
+ return ExplicitUnroll;
+ }
+
+ // 5th priority is partial unrolling.
// Try partial unroll only when TripCount can be statically calculated.
if (TripCount) {
UP.Partial |= ExplicitUnroll;
@@ -833,14 +855,6 @@ static bool computeUnrollCount(
<< "Unable to fully unroll loop as directed by unroll(full) pragma "
"because loop has a runtime trip count.");
- // 5th priority is loop peeling
- computePeelCount(L, LoopSize, UP);
- if (UP.PeelCount) {
- UP.Runtime = false;
- UP.Count = 1;
- return ExplicitUnroll;
- }
-
// 6th priority is runtime unrolling.
// Don't unroll a runtime trip count loop when it is disabled.
if (HasRuntimeUnrollDisablePragma(L)) {
@@ -914,7 +928,7 @@ static bool computeUnrollCount(
static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
ScalarEvolution *SE, const TargetTransformInfo &TTI,
AssumptionCache &AC, OptimizationRemarkEmitter &ORE,
- bool PreserveLCSSA,
+ bool PreserveLCSSA, int OptLevel,
Optional<unsigned> ProvidedCount,
Optional<unsigned> ProvidedThreshold,
Optional<bool> ProvidedAllowPartial,
@@ -934,7 +948,7 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
bool NotDuplicatable;
bool Convergent;
TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
- L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial,
+ L, TTI, OptLevel, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial,
ProvidedRuntime, ProvidedUpperBound);
// Exit early if unrolling is disabled.
if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0))
@@ -1034,16 +1048,17 @@ namespace {
class LoopUnroll : public LoopPass {
public:
static char ID; // Pass ID, replacement for typeid
- LoopUnroll(Optional<unsigned> Threshold = None,
+ LoopUnroll(int OptLevel = 2, Optional<unsigned> Threshold = None,
Optional<unsigned> Count = None,
Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
Optional<bool> UpperBound = None)
- : LoopPass(ID), ProvidedCount(std::move(Count)),
+ : LoopPass(ID), OptLevel(OptLevel), ProvidedCount(std::move(Count)),
ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound) {
initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
}
+ int OptLevel;
Optional<unsigned> ProvidedCount;
Optional<unsigned> ProvidedThreshold;
Optional<bool> ProvidedAllowPartial;
@@ -1068,7 +1083,7 @@ public:
OptimizationRemarkEmitter ORE(&F);
bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
- return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA,
+ return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel,
ProvidedCount, ProvidedThreshold,
ProvidedAllowPartial, ProvidedRuntime,
ProvidedUpperBound);
@@ -1094,26 +1109,27 @@ INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
-Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial,
- int Runtime, int UpperBound) {
+Pass *llvm::createLoopUnrollPass(int OptLevel, int Threshold, int Count,
+ int AllowPartial, int Runtime,
+ int UpperBound) {
// TODO: It would make more sense for this function to take the optionals
// directly, but that's dangerous since it would silently break out of tree
// callers.
- return new LoopUnroll(Threshold == -1 ? None : Optional<unsigned>(Threshold),
- Count == -1 ? None : Optional<unsigned>(Count),
- AllowPartial == -1 ? None
- : Optional<bool>(AllowPartial),
- Runtime == -1 ? None : Optional<bool>(Runtime),
- UpperBound == -1 ? None : Optional<bool>(UpperBound));
+ return new LoopUnroll(
+ OptLevel, Threshold == -1 ? None : Optional<unsigned>(Threshold),
+ Count == -1 ? None : Optional<unsigned>(Count),
+ AllowPartial == -1 ? None : Optional<bool>(AllowPartial),
+ Runtime == -1 ? None : Optional<bool>(Runtime),
+ UpperBound == -1 ? None : Optional<bool>(UpperBound));
}
-Pass *llvm::createSimpleLoopUnrollPass() {
- return llvm::createLoopUnrollPass(-1, -1, 0, 0, 0);
+Pass *llvm::createSimpleLoopUnrollPass(int OptLevel) {
+ return llvm::createLoopUnrollPass(OptLevel, -1, -1, 0, 0, 0);
}
PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
- LPMUpdater &) {
+ LPMUpdater &Updater) {
const auto &FAM =
AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
Function *F = L.getHeader()->getParent();
@@ -1124,12 +1140,84 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
report_fatal_error("LoopUnrollPass: OptimizationRemarkEmitterAnalysis not "
"cached at a higher level");
- bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, &AR.SE, AR.TTI, AR.AC, *ORE,
- /*PreserveLCSSA*/ true, ProvidedCount,
- ProvidedThreshold, ProvidedAllowPartial,
- ProvidedRuntime, ProvidedUpperBound);
-
+ // Keep track of the previous loop structure so we can identify new loops
+ // created by unrolling.
+ Loop *ParentL = L.getParentLoop();
+ SmallPtrSet<Loop *, 4> OldLoops;
+ if (ParentL)
+ OldLoops.insert(ParentL->begin(), ParentL->end());
+ else
+ OldLoops.insert(AR.LI.begin(), AR.LI.end());
+
+ // The API here is quite complex to call, but there are only two interesting
+ // states we support: partial and full (or "simple") unrolling. However, to
+ // enable these things we actually pass "None" in for the optional to avoid
+ // providing an explicit choice.
+ Optional<bool> AllowPartialParam, RuntimeParam, UpperBoundParam;
+ if (!AllowPartialUnrolling)
+ AllowPartialParam = RuntimeParam = UpperBoundParam = false;
+ bool Changed = tryToUnrollLoop(
+ &L, AR.DT, &AR.LI, &AR.SE, AR.TTI, AR.AC, *ORE,
+ /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None,
+ /*Threshold*/ None, AllowPartialParam, RuntimeParam, UpperBoundParam);
if (!Changed)
return PreservedAnalyses::all();
+
+ // The parent must not be damaged by unrolling!
+#ifndef NDEBUG
+ if (ParentL)
+ ParentL->verifyLoop();
+#endif
+
+ // Unrolling can do several things to introduce new loops into a loop nest:
+ // - Partial unrolling clones child loops within the current loop. If it
+ // uses a remainder, then it can also create any number of sibling loops.
+ // - Full unrolling clones child loops within the current loop but then
+ // removes the current loop making all of the children appear to be new
+ // sibling loops.
+ // - Loop peeling can directly introduce new sibling loops by peeling one
+ // iteration.
+ //
+ // When a new loop appears as a sibling loop, either from peeling an
+ // iteration or fully unrolling, its nesting structure has fundamentally
+ // changed and we want to revisit it to reflect that.
+ //
+ // When unrolling has removed the current loop, we need to tell the
+ // infrastructure that it is gone.
+ //
+ // Finally, we support a debugging/testing mode where we revisit child loops
+ // as well. These are not expected to require further optimizations as either
+ // they or the loop they were cloned from have been directly visited already.
+ // But the debugging mode allows us to check this assumption.
+ bool IsCurrentLoopValid = false;
+ SmallVector<Loop *, 4> SibLoops;
+ if (ParentL)
+ SibLoops.append(ParentL->begin(), ParentL->end());
+ else
+ SibLoops.append(AR.LI.begin(), AR.LI.end());
+ erase_if(SibLoops, [&](Loop *SibLoop) {
+ if (SibLoop == &L) {
+ IsCurrentLoopValid = true;
+ return true;
+ }
+
+ // Otherwise erase the loop from the list if it was in the old loops.
+ return OldLoops.count(SibLoop) != 0;
+ });
+ Updater.addSiblingLoops(SibLoops);
+
+ if (!IsCurrentLoopValid) {
+ Updater.markLoopAsDeleted(L);
+ } else {
+ // We can only walk child loops if the current loop remained valid.
+ if (UnrollRevisitChildLoops) {
+ // Walk *all* of the child loops. This is a highly speculative mode
+ // anyway, so look for any simplifications that arose from partial
+ // unrolling or peeling off of iterations.
+ SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
+ Updater.addChildLoops(ChildLoops);
+ }
+ }
+
return getLoopPassPreservedAnalyses();
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
index 76fe91884c7b..a99c9999c619 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -33,6 +33,7 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
@@ -47,6 +48,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/CommandLine.h"
@@ -77,19 +79,6 @@ static cl::opt<unsigned>
Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"),
cl::init(100), cl::Hidden);
-static cl::opt<bool>
-LoopUnswitchWithBlockFrequency("loop-unswitch-with-block-frequency",
- cl::init(false), cl::Hidden,
- cl::desc("Enable the use of the block frequency analysis to access PGO "
- "heuristics to minimize code growth in cold regions."));
-
-static cl::opt<unsigned>
-ColdnessThreshold("loop-unswitch-coldness-threshold", cl::init(1), cl::Hidden,
- cl::desc("Coldness threshold in percentage. The loop header frequency "
- "(relative to the entry frequency) is compared with this "
- "threshold to determine if non-trivial unswitching should be "
- "enabled."));
-
namespace {
class LUAnalysisCache {
@@ -174,13 +163,6 @@ namespace {
LUAnalysisCache BranchesInfo;
- bool EnabledPGO;
-
- // BFI and ColdEntryFreq are only used when PGO and
- // LoopUnswitchWithBlockFrequency are enabled.
- BlockFrequencyInfo BFI;
- BlockFrequency ColdEntryFreq;
-
bool OptimizeForSize;
bool redoLoop;
@@ -199,12 +181,14 @@ namespace {
// NewBlocks contained cloned copy of basic blocks from LoopBlocks.
std::vector<BasicBlock*> NewBlocks;
+ bool hasBranchDivergence;
+
public:
static char ID; // Pass ID, replacement for typeid
- explicit LoopUnswitch(bool Os = false) :
+ explicit LoopUnswitch(bool Os = false, bool hasBranchDivergence = false) :
LoopPass(ID), OptimizeForSize(Os), redoLoop(false),
currentLoop(nullptr), DT(nullptr), loopHeader(nullptr),
- loopPreheader(nullptr) {
+ loopPreheader(nullptr), hasBranchDivergence(hasBranchDivergence) {
initializeLoopUnswitchPass(*PassRegistry::getPassRegistry());
}
@@ -217,6 +201,8 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
+ if (hasBranchDivergence)
+ AU.addRequired<DivergenceAnalysis>();
getLoopAnalysisUsage(AU);
}
@@ -255,6 +241,11 @@ namespace {
TerminatorInst *TI);
void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L);
+
+ /// Given that the Invariant is not equal to Val, simplify instructions
+ /// in the loop.
+ Value *SimplifyInstructionWithNotEqual(Instruction *Inst, Value *Invariant,
+ Constant *Val);
};
}
@@ -381,16 +372,35 @@ INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
false, false)
-Pass *llvm::createLoopUnswitchPass(bool Os) {
- return new LoopUnswitch(Os);
+Pass *llvm::createLoopUnswitchPass(bool Os, bool hasBranchDivergence) {
+ return new LoopUnswitch(Os, hasBranchDivergence);
}
+/// Operator chain lattice.
+enum OperatorChain {
+ OC_OpChainNone, ///< There is no operator.
+ OC_OpChainOr, ///< There are only ORs.
+ OC_OpChainAnd, ///< There are only ANDs.
+ OC_OpChainMixed ///< There are ANDs and ORs.
+};
+
/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
/// an invariant piece, return the invariant. Otherwise, return null.
+//
+/// NOTE: FindLIVLoopCondition will not return a partial LIV by walking up a
+/// mixed operator chain, as we can not reliably find a value which will simplify
+/// the operator chain. If the chain is AND-only or OR-only, we can use 0 or ~0
+/// to simplify the chain.
+///
+/// NOTE: In the case of a partial LIV and a mixed operator chain, we may be
+/// able to simplify the condition itself to a loop-variant condition, but at the
+/// cost of creating an entirely new loop.
static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
+ OperatorChain &ParentChain,
DenseMap<Value *, Value *> &Cache) {
auto CacheIt = Cache.find(Cond);
if (CacheIt != Cache.end())
@@ -414,21 +424,53 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
return Cond;
}
+ // Walk up the operator chain to find partial invariant conditions.
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond))
if (BO->getOpcode() == Instruction::And ||
BO->getOpcode() == Instruction::Or) {
- // If either the left or right side is invariant, we can unswitch on this,
- // which will cause the branch to go away in one loop and the condition to
- // simplify in the other one.
- if (Value *LHS =
- FindLIVLoopCondition(BO->getOperand(0), L, Changed, Cache)) {
- Cache[Cond] = LHS;
- return LHS;
+ // Given the previous operator, compute the current operator chain status.
+ OperatorChain NewChain;
+ switch (ParentChain) {
+ case OC_OpChainNone:
+ NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
+ OC_OpChainOr;
+ break;
+ case OC_OpChainOr:
+ NewChain = BO->getOpcode() == Instruction::Or ? OC_OpChainOr :
+ OC_OpChainMixed;
+ break;
+ case OC_OpChainAnd:
+ NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
+ OC_OpChainMixed;
+ break;
+ case OC_OpChainMixed:
+ NewChain = OC_OpChainMixed;
+ break;
}
- if (Value *RHS =
- FindLIVLoopCondition(BO->getOperand(1), L, Changed, Cache)) {
- Cache[Cond] = RHS;
- return RHS;
+
+ // If we reach a Mixed state, we do not want to keep walking up, as we cannot
+ // reliably find a value that will simplify the chain. With this check, we
+ // will return null at the first sight of a mixed chain and the caller will
+ // either backtrack to find a partial LIV in the other operand or return null.
+ if (NewChain != OC_OpChainMixed) {
+ // Update the current operator chain type before we search up the chain.
+ ParentChain = NewChain;
+ // If either the left or right side is invariant, we can unswitch on this,
+ // which will cause the branch to go away in one loop and the condition to
+ // simplify in the other one.
+ if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed,
+ ParentChain, Cache)) {
+ Cache[Cond] = LHS;
+ return LHS;
+ }
+ // We did not manage to find a partial LIV in operand(0). Backtrack and try
+ // operand(1).
+ ParentChain = NewChain;
+ if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed,
+ ParentChain, Cache)) {
+ Cache[Cond] = RHS;
+ return RHS;
+ }
}
}
@@ -436,9 +478,21 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
return nullptr;
}
-static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) {
+/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
+/// an invariant piece, return the invariant along with the operator chain type.
+/// Otherwise, return null.
+static std::pair<Value *, OperatorChain> FindLIVLoopCondition(Value *Cond,
+ Loop *L,
+ bool &Changed) {
DenseMap<Value *, Value *> Cache;
- return FindLIVLoopCondition(Cond, L, Changed, Cache);
+ OperatorChain OpChain = OC_OpChainNone;
+ Value *FCond = FindLIVLoopCondition(Cond, L, Changed, OpChain, Cache);
+
+ // In case we do find a LIV, it can not be obtained by walking up a mixed
+ // operator chain.
+ assert((!FCond || OpChain != OC_OpChainMixed) &&
+ "Do not expect a partial LIV with mixed operator chain");
+ return {FCond, OpChain};
}
bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
@@ -457,19 +511,6 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
if (SanitizeMemory)
computeLoopSafetyInfo(&SafetyInfo, L);
- EnabledPGO = F->getEntryCount().hasValue();
-
- if (LoopUnswitchWithBlockFrequency && EnabledPGO) {
- BranchProbabilityInfo BPI(*F, *LI);
- BFI.calculate(*L->getHeader()->getParent(), BPI, *LI);
-
- // Use BranchProbability to compute a minimum frequency based on
- // function entry baseline frequency. Loops with headers below this
- // frequency are considered as cold.
- const BranchProbability ColdProb(ColdnessThreshold, 100);
- ColdEntryFreq = BlockFrequency(BFI.getEntryFreq()) * ColdProb;
- }
-
bool Changed = false;
do {
assert(currentLoop->isLCSSAForm(*DT));
@@ -581,19 +622,9 @@ bool LoopUnswitch::processCurrentLoop() {
loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
return false;
- if (LoopUnswitchWithBlockFrequency && EnabledPGO) {
- // Compute the weighted frequency of the hottest block in the
- // loop (loopHeader in this case since inner loops should be
- // processed before outer loop). If it is less than ColdFrequency,
- // we should not unswitch.
- BlockFrequency LoopEntryFreq = BFI.getBlockFreq(loopHeader);
- if (LoopEntryFreq < ColdEntryFreq)
- return false;
- }
-
for (IntrinsicInst *Guard : Guards) {
Value *LoopCond =
- FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed);
+ FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed).first;
if (LoopCond &&
UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
// NB! Unswitching (if successful) could have erased some of the
@@ -634,7 +665,7 @@ bool LoopUnswitch::processCurrentLoop() {
// See if this, or some part of it, is loop invariant. If so, we can
// unswitch on it if we desire.
Value *LoopCond = FindLIVLoopCondition(BI->getCondition(),
- currentLoop, Changed);
+ currentLoop, Changed).first;
if (LoopCond &&
UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) {
++NumBranches;
@@ -642,24 +673,48 @@ bool LoopUnswitch::processCurrentLoop() {
}
}
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
- Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
- currentLoop, Changed);
+ Value *SC = SI->getCondition();
+ Value *LoopCond;
+ OperatorChain OpChain;
+ std::tie(LoopCond, OpChain) =
+ FindLIVLoopCondition(SC, currentLoop, Changed);
+
unsigned NumCases = SI->getNumCases();
if (LoopCond && NumCases) {
// Find a value to unswitch on:
// FIXME: this should chose the most expensive case!
// FIXME: scan for a case with a non-critical edge?
Constant *UnswitchVal = nullptr;
-
- // Do not process same value again and again.
- // At this point we have some cases already unswitched and
- // some not yet unswitched. Let's find the first not yet unswitched one.
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
- i != e; ++i) {
- Constant *UnswitchValCandidate = i.getCaseValue();
- if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) {
- UnswitchVal = UnswitchValCandidate;
- break;
+ // Find a case value such that at least one case value is unswitched
+ // out.
+ if (OpChain == OC_OpChainAnd) {
+        // If the chain only has ANDs and the switch has a case value of 0,
+        // dropping a 0 into the chain will unswitch out the 0 case value.
+ auto *AllZero = cast<ConstantInt>(Constant::getNullValue(SC->getType()));
+ if (BranchesInfo.isUnswitched(SI, AllZero))
+ continue;
+ // We are unswitching 0 out.
+ UnswitchVal = AllZero;
+ } else if (OpChain == OC_OpChainOr) {
+        // If the chain only has ORs and the switch has a case value of ~0,
+        // dropping a ~0 into the chain will unswitch out the ~0 case value.
+ auto *AllOne = cast<ConstantInt>(Constant::getAllOnesValue(SC->getType()));
+ if (BranchesInfo.isUnswitched(SI, AllOne))
+ continue;
+ // We are unswitching ~0 out.
+ UnswitchVal = AllOne;
+ } else {
+ assert(OpChain == OC_OpChainNone &&
+ "Expect to unswitch on trivial chain");
+ // Do not process same value again and again.
+ // At this point we have some cases already unswitched and
+ // some not yet unswitched. Let's find the first not yet unswitched one.
+ for (auto Case : SI->cases()) {
+ Constant *UnswitchValCandidate = Case.getCaseValue();
+ if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) {
+ UnswitchVal = UnswitchValCandidate;
+ break;
+ }
}
}
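
As an aside for readers of this hunk, here is a stand-alone C++ sketch (illustrative names only, not part of the patch) of why the two special cases above are sound: if the loop-invariant input of an all-AND chain is assumed to be 0, the whole chain evaluates to 0 and only the 0-case successor survives; dually, an all-OR chain with the invariant assumed to be all-ones evaluates to ~0.

#include <cassert>
#include <cstdint>

// Stand-ins for the operator chains that feed the switch condition.
static uint32_t andChain(uint32_t A, uint32_t B, uint32_t LIV) {
  return A & B & LIV;
}
static uint32_t orChain(uint32_t A, uint32_t B, uint32_t LIV) {
  return A | B | LIV;
}

int main() {
  // Pinning the invariant to 0 makes the AND chain 0 for any A and B, so the
  // corresponding loop copy can only reach the 0-case of the switch.
  assert(andChain(0x12u, 0x34u, /*LIV=*/0u) == 0u);
  // Pinning the invariant to ~0 makes the OR chain ~0, selecting the ~0 case.
  assert(orChain(0x12u, 0x34u, /*LIV=*/~0u) == ~0u);
  return 0;
}
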
@@ -668,6 +723,11 @@ bool LoopUnswitch::processCurrentLoop() {
if (UnswitchIfProfitable(LoopCond, UnswitchVal)) {
++NumSwitches;
+ // In case of a full LIV, UnswitchVal is the value we unswitched out.
+      // In case of a partial LIV, we only unswitch when it's an AND-chain
+      // or OR-chain. In both cases the switch input value simplifies to
+ // UnswitchVal.
+ BranchesInfo.setUnswitched(SI, UnswitchVal);
return true;
}
}
@@ -678,7 +738,7 @@ bool LoopUnswitch::processCurrentLoop() {
BBI != E; ++BBI)
if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) {
Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
- currentLoop, Changed);
+ currentLoop, Changed).first;
if (LoopCond && UnswitchIfProfitable(LoopCond,
ConstantInt::getTrue(Context))) {
++NumSelects;
@@ -753,6 +813,15 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,
<< ". Cost too high.\n");
return false;
}
+ if (hasBranchDivergence &&
+ getAnalysis<DivergenceAnalysis>().isDivergent(LoopCond)) {
+ DEBUG(dbgs() << "NOT unswitching loop %"
+ << currentLoop->getHeader()->getName()
+ << " at non-trivial condition '" << *Val
+ << "' == " << *LoopCond << "\n"
+ << ". Condition is divergent.\n");
+ return false;
+ }
UnswitchNontrivialCondition(LoopCond, Val, currentLoop, TI);
return true;
@@ -899,7 +968,6 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
if (I.mayHaveSideEffects())
return false;
- // FIXME: add check for constant foldable switch instructions.
if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
if (BI->isUnconditional()) {
CurrentBB = BI->getSuccessor(0);
@@ -911,7 +979,16 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
// Found a trivial condition candidate: non-foldable conditional branch.
break;
}
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
+    // At this point, any constant-foldable instructions should probably
+    // have been folded already.
+ ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition());
+ if (!Cond)
+ break;
+ // Find the target block we are definitely going to.
+ CurrentBB = SI->findCaseValue(Cond)->getCaseSuccessor();
} else {
+ // We do not understand these terminator instructions.
break;
}
@@ -929,7 +1006,7 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
return false;
Value *LoopCond = FindLIVLoopCondition(BI->getCondition(),
- currentLoop, Changed);
+ currentLoop, Changed).first;
// Unswitch only if the trivial condition itself is an LIV (not
// partial LIV which could occur in and/or)
@@ -960,7 +1037,7 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
// If this isn't switching on an invariant condition, we can't unswitch it.
Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
- currentLoop, Changed);
+ currentLoop, Changed).first;
// Unswitch only if the trivial condition itself is an LIV (not
// partial LIV which could occur in and/or)
@@ -973,13 +1050,12 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
// this.
// Note that we can't trivially unswitch on the default case or
// on already unswitched cases.
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
- i != e; ++i) {
+ for (auto Case : SI->cases()) {
BasicBlock *LoopExitCandidate;
- if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop,
- i.getCaseSuccessor()))) {
+ if ((LoopExitCandidate =
+ isTrivialLoopExitBlock(currentLoop, Case.getCaseSuccessor()))) {
// Okay, we found a trivial case, remember the value that is trivial.
- ConstantInt *CaseVal = i.getCaseValue();
+ ConstantInt *CaseVal = Case.getCaseValue();
// Check that it was not unswitched before, since already unswitched
       // trivial vals look trivial too.
@@ -998,6 +1074,9 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB,
nullptr);
+
+ // We are only unswitching full LIV.
+ BranchesInfo.setUnswitched(SI, CondVal);
++NumSwitches;
return true;
}
@@ -1253,18 +1332,38 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
if (!UI || !L->contains(UI))
continue;
- Worklist.push_back(UI);
+ // At this point, we know LIC is definitely not Val. Try to use some simple
+  // logic to simplify the user w.r.t. the context.
+ if (Value *Replacement = SimplifyInstructionWithNotEqual(UI, LIC, Val)) {
+ if (LI->replacementPreservesLCSSAForm(UI, Replacement)) {
+        // This in-loop instruction has been simplified w.r.t. its context,
+        // i.e. LIC != Val; make sure we propagate its replacement value to
+        // all its users.
+        //
+        // We cannot delete UI, the LIC user, yet, because that would
+        // invalidate the LIC->users() iterator. However, we can make this
+        // instruction dead by replacing all its users and pushing it onto the
+        // worklist so that it can be properly deleted and its operands
+        // simplified.
+ UI->replaceAllUsesWith(Replacement);
+ }
+ }
- // TODO: We could do other simplifications, for example, turning
- // 'icmp eq LIC, Val' -> false.
+ // This is a LIC user, push it into the worklist so that SimplifyCode can
+ // attempt to simplify it.
+ Worklist.push_back(UI);
// If we know that LIC is not Val, use this info to simplify code.
SwitchInst *SI = dyn_cast<SwitchInst>(UI);
if (!SI || !isa<ConstantInt>(Val)) continue;
- SwitchInst::CaseIt DeadCase = SI->findCaseValue(cast<ConstantInt>(Val));
+ // NOTE: if a case value for the switch is unswitched out, we record it
+ // after the unswitch finishes. We can not record it here as the switch
+ // is not a direct user of the partial LIV.
+ SwitchInst::CaseHandle DeadCase =
+ *SI->findCaseValue(cast<ConstantInt>(Val));
// Default case is live for multiple values.
- if (DeadCase == SI->case_default()) continue;
+ if (DeadCase == *SI->case_default())
+ continue;
// Found a dead case value. Don't remove PHI nodes in the
// successor if they become single-entry, those PHI nodes may
@@ -1274,8 +1373,6 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
BasicBlock *SISucc = DeadCase.getCaseSuccessor();
BasicBlock *Latch = L->getLoopLatch();
- BranchesInfo.setUnswitched(SI, Val);
-
if (!SI->findCaseDest(SISucc)) continue; // Edge is critical.
// If the DeadCase successor dominates the loop latch, then the
// transformation isn't safe since it will delete the sole predecessor edge
@@ -1397,3 +1494,27 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
}
}
}
+
+/// Simple simplifications we can do given the information that Invariant is
+/// definitely not equal to Val.
+Value *LoopUnswitch::SimplifyInstructionWithNotEqual(Instruction *Inst,
+ Value *Invariant,
+ Constant *Val) {
+ // icmp eq cond, val -> false
+ ICmpInst *CI = dyn_cast<ICmpInst>(Inst);
+ if (CI && CI->isEquality()) {
+ Value *Op0 = CI->getOperand(0);
+ Value *Op1 = CI->getOperand(1);
+ if ((Op0 == Invariant && Op1 == Val) || (Op0 == Val && Op1 == Invariant)) {
+ LLVMContext &Ctx = Inst->getContext();
+ if (CI->getPredicate() == CmpInst::ICMP_EQ)
+ return ConstantInt::getFalse(Ctx);
+ else
+ return ConstantInt::getTrue(Ctx);
+ }
+ }
+
+ // FIXME: there may be other opportunities, e.g. comparison with floating
+ // point, or Invariant - Val != 0, etc.
+ return nullptr;
+}
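
A hypothetical source-level sketch of the simplification SimplifyInstructionWithNotEqual performs (function names and the constant 42 are made up for illustration): in the loop copy that is only entered when the invariant differs from the unswitched value, equality compares against that value fold to false and inequality compares fold to true.

#include <cassert>

// Assume the unswitched loop copy is only entered when Invariant != 42.
static bool eqAfterSimplify(int Invariant) {
  (void)Invariant;
  return false;            // icmp eq Invariant, 42  ->  false
}
static bool neAfterSimplify(int Invariant) {
  (void)Invariant;
  return true;             // icmp ne Invariant, 42  ->  true
}

int main() {
  const int Vals[] = {-1, 0, 7, 41, 43}; // values consistent with != 42
  for (int Inv : Vals) {
    assert(eqAfterSimplify(Inv) == (Inv == 42));
    assert(neAfterSimplify(Inv) == (Inv != 42));
  }
  return 0;
}
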
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 52975ef35153..a143b9a3c645 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -67,11 +67,11 @@ static bool handleSwitchExpect(SwitchInst &SI) {
if (!ExpectedValue)
return false;
- SwitchInst::CaseIt Case = SI.findCaseValue(ExpectedValue);
+ SwitchInst::CaseHandle Case = *SI.findCaseValue(ExpectedValue);
unsigned n = SI.getNumCases(); // +1 for default case.
SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeight);
- if (Case == SI.case_default())
+ if (Case == *SI.case_default())
Weights[0] = LikelyBranchWeight;
else
Weights[Case.getCaseIndex() + 1] = LikelyBranchWeight;
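
A minimal sketch of the weight-vector layout the hunk above relies on (the helper name and the weight values are placeholders, not the pass's actual constants): slot 0 of the branch-weight metadata belongs to the default destination and case i maps to slot i + 1, so the expected destination gets the likely weight and every other destination the unlikely one.

#include <cstdint>
#include <vector>

static std::vector<uint32_t> switchWeights(unsigned NumCases,
                                           int ExpectedCaseIdx, // -1 = default
                                           uint32_t Likely, uint32_t Unlikely) {
  std::vector<uint32_t> Weights(NumCases + 1, Unlikely);
  if (ExpectedCaseIdx < 0)
    Weights[0] = Likely;                                // default destination
  else
    Weights[static_cast<unsigned>(ExpectedCaseIdx) + 1] = Likely; // case i
  return Weights;
}

// Example: switchWeights(3, 1, 2000, 1) produces {1, 1, 2000, 1}.
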
diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 1b590140f70a..a3f3f25c1e0f 100644
--- a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -12,20 +12,49 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
using namespace llvm;
#define DEBUG_TYPE "memcpyopt"
@@ -119,6 +148,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
return true;
}
+namespace {
/// Represents a range of memset'd bytes with the ByteVal value.
/// This allows us to analyze stores like:
@@ -130,7 +160,6 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
/// the first store, we make a range [1, 2). The second store extends the range
/// to [0, 2). The third makes a new range [2, 3). The fourth store joins the
/// two ranges into [0, 3) which is memset'able.
-namespace {
struct MemsetRange {
// Start/End - A semi range that describes the span that this range covers.
// The range is closed at the start and open at the end: [Start, End).
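
The half-open-interval bookkeeping described in the comment above, as an illustrative stand-alone sketch rather than MemCpyOpt's real MemsetRanges class: each store of the same byte value contributes a [Start, End) range, and overlapping or adjacent ranges are merged until a memset-able span such as [0, 3) emerges.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

struct ByteRange { int64_t Start, End; }; // half-open: [Start, End)

static std::vector<ByteRange> mergeRanges(std::vector<ByteRange> Rs) {
  std::sort(Rs.begin(), Rs.end(), [](const ByteRange &A, const ByteRange &B) {
    return A.Start < B.Start;
  });
  std::vector<ByteRange> Out;
  for (const ByteRange &R : Rs) {
    if (!Out.empty() && R.Start <= Out.back().End)
      Out.back().End = std::max(Out.back().End, R.End); // extend existing span
    else
      Out.push_back(R);                                 // start a new span
  }
  return Out;
}

int main() {
  // Stores covering [1,2), [0,2) and [2,3) collapse to the single span [0,3).
  auto Merged = mergeRanges({{1, 2}, {0, 2}, {2, 3}});
  assert(Merged.size() == 1 && Merged[0].Start == 0 && Merged[0].End == 3);
  return 0;
}
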
@@ -148,7 +177,8 @@ struct MemsetRange {
bool isProfitableToUseMemset(const DataLayout &DL) const;
};
-} // end anon namespace
+
+} // end anonymous namespace
bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
// If we found more than 4 stores to merge or 16 bytes, use memset.
@@ -192,13 +222,14 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
return TheStores.size() > NumPointerStores+NumByteStores;
}
-
namespace {
+
class MemsetRanges {
/// A sorted list of the memset ranges.
SmallVector<MemsetRange, 8> Ranges;
typedef SmallVectorImpl<MemsetRange>::iterator range_iterator;
const DataLayout &DL;
+
public:
MemsetRanges(const DataLayout &DL) : DL(DL) {}
@@ -231,8 +262,7 @@ public:
};
-} // end anon namespace
-
+} // end anonymous namespace
/// Add a new store to the MemsetRanges data structure. This adds a
/// new range for the specified store at the specified offset, merging into
@@ -299,48 +329,36 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
//===----------------------------------------------------------------------===//
namespace {
- class MemCpyOptLegacyPass : public FunctionPass {
- MemCpyOptPass Impl;
- public:
- static char ID; // Pass identification, replacement for typeid
- MemCpyOptLegacyPass() : FunctionPass(ID) {
- initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry());
- }
- bool runOnFunction(Function &F) override;
-
- private:
- // This transformation requires dominator postdominator info
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<MemoryDependenceWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<MemoryDependenceWrapperPass>();
- }
+class MemCpyOptLegacyPass : public FunctionPass {
+ MemCpyOptPass Impl;
- // Helper functions
- bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
- bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
- bool processMemCpy(MemCpyInst *M);
- bool processMemMove(MemMoveInst *M);
- bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc,
- uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
- bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep);
- bool processMemSetMemCpyDependence(MemCpyInst *M, MemSetInst *MDep);
- bool performMemCpyToMemSetOptzn(MemCpyInst *M, MemSetInst *MDep);
- bool processByValArgument(CallSite CS, unsigned ArgNo);
- Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
- Value *ByteVal);
-
- bool iterateOnFunction(Function &F);
- };
+public:
+ static char ID; // Pass identification, replacement for typeid
- char MemCpyOptLegacyPass::ID = 0;
-}
+ MemCpyOptLegacyPass() : FunctionPass(ID) {
+ initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ // This transformation requires dominator postdominator info
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<MemoryDependenceWrapperPass>();
+ }
+};
+
+char MemCpyOptLegacyPass::ID = 0;
+
+} // end anonymous namespace
/// The public interface to this file...
FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOptLegacyPass(); }
@@ -523,14 +541,15 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
if (Args.erase(C))
NeedLift = true;
else if (MayAlias) {
- NeedLift = any_of(MemLocs, [C, &AA](const MemoryLocation &ML) {
+ NeedLift = llvm::any_of(MemLocs, [C, &AA](const MemoryLocation &ML) {
return AA.getModRefInfo(C, ML);
});
if (!NeedLift)
- NeedLift = any_of(CallSites, [C, &AA](const ImmutableCallSite &CS) {
- return AA.getModRefInfo(C, CS);
- });
+ NeedLift =
+ llvm::any_of(CallSites, [C, &AA](const ImmutableCallSite &CS) {
+ return AA.getModRefInfo(C, CS);
+ });
}
if (!NeedLift)
@@ -567,7 +586,7 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
}
// We made it, we need to lift
- for (auto *I : reverse(ToLift)) {
+ for (auto *I : llvm::reverse(ToLift)) {
DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
I->moveBefore(P);
}
@@ -761,7 +780,6 @@ bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
return false;
}
-
/// Takes a memcpy and a call that it depends on,
/// and checks for the possibility of a call slot optimization by having
/// the call write its result directly into the destination of the memcpy.
@@ -914,6 +932,17 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
if (MR != MRI_NoModRef)
return false;
+ // We can't create address space casts here because we don't know if they're
+ // safe for the target.
+ if (cpySrc->getType()->getPointerAddressSpace() !=
+ cpyDest->getType()->getPointerAddressSpace())
+ return false;
+ for (unsigned i = 0; i < CS.arg_size(); ++i)
+ if (CS.getArgument(i)->stripPointerCasts() == cpySrc &&
+ cpySrc->getType()->getPointerAddressSpace() !=
+ CS.getArgument(i)->getType()->getPointerAddressSpace())
+ return false;
+
// All the checks have passed, so do the transformation.
bool changedArgument = false;
for (unsigned i = 0; i < CS.arg_size(); ++i)
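
A simplified sketch of the guard added in this hunk, using plain address-space numbers instead of LLVM pointer types (the helper name is hypothetical): the call-slot rewrite would have to introduce addrspacecasts whose legality is target-specific, so it gives up whenever the destination, or any call argument that strips back to the source, lives in a different address space than the source.

#include <vector>

// SrcAS/DstAS are the address spaces of the memcpy source and destination;
// SrcArgASes are the address spaces of the call arguments that strip back to
// the memcpy source pointer.
static bool callSlotAddrSpacesOK(unsigned SrcAS, unsigned DstAS,
                                 const std::vector<unsigned> &SrcArgASes) {
  if (SrcAS != DstAS)
    return false;                 // would need a source -> dest addrspacecast
  for (unsigned ArgAS : SrcArgASes)
    if (ArgAS != SrcAS)
      return false;               // would need a cast at the call site
  return true;
}
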
@@ -1240,7 +1269,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
AliasAnalysis &AA = LookupAliasAnalysis();
- if (!TLI->has(LibFunc::memmove))
+ if (!TLI->has(LibFunc_memmove))
return false;
// See if the pointers alias.
@@ -1306,6 +1335,11 @@ bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) {
CS.getInstruction(), &AC, &DT) < ByValAlign)
return false;
+ // The address space of the memcpy source must match the byval argument
+ if (MDep->getSource()->getType()->getPointerAddressSpace() !=
+ ByValArg->getType()->getPointerAddressSpace())
+ return false;
+
// Verify that the copied-from memory doesn't change in between the memcpy and
// the byval call.
// memcpy(a <- b)
@@ -1375,7 +1409,6 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) {
}
PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
-
auto &MD = AM.getResult<MemoryDependenceAnalysis>(F);
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
@@ -1393,7 +1426,9 @@ PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
LookupAssumptionCache, LookupDomTree);
if (!MadeChange)
return PreservedAnalyses::all();
+
PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
PA.preserve<MemoryDependenceAnalysis>();
return PA;
@@ -1414,10 +1449,10 @@ bool MemCpyOptPass::runImpl(
// If we don't have at least memset and memcpy, there is little point of doing
// anything here. These are required by a freestanding implementation, so if
// even they are disabled, there is no point in trying hard.
- if (!TLI->has(LibFunc::memset) || !TLI->has(LibFunc::memcpy))
+ if (!TLI->has(LibFunc_memset) || !TLI->has(LibFunc_memcpy))
return false;
- while (1) {
+ while (true) {
if (!iterateOnFunction(F))
break;
MadeChange = true;
diff --git a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index 6a64c6b3619c..acd3ef6791be 100644
--- a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -19,6 +19,8 @@
// thinks it safe to do so. This optimization helps with eg. hiding load
// latencies, triggering if-conversion, and reducing static code size.
//
+// NOTE: This code no longer performs load hoisting; it is subsumed by GVNHoist.
+//
//===----------------------------------------------------------------------===//
//
//
@@ -87,7 +89,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;
@@ -118,16 +119,6 @@ private:
void removeInstruction(Instruction *Inst);
BasicBlock *getDiamondTail(BasicBlock *BB);
bool isDiamondHead(BasicBlock *BB);
- // Routines for hoisting loads
- bool isLoadHoistBarrierInRange(const Instruction &Start,
- const Instruction &End, LoadInst *LI,
- bool SafeToLoadUnconditionally);
- LoadInst *canHoistFromBlock(BasicBlock *BB, LoadInst *LI);
- void hoistInstruction(BasicBlock *BB, Instruction *HoistCand,
- Instruction *ElseInst);
- bool isSafeToHoist(Instruction *I) const;
- bool hoistLoad(BasicBlock *BB, LoadInst *HoistCand, LoadInst *ElseInst);
- bool mergeLoads(BasicBlock *BB);
// Routines for sinking stores
StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI);
PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1);
@@ -188,169 +179,6 @@ bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {
return true;
}
-///
-/// \brief True when instruction is a hoist barrier for a load
-///
-/// Whenever an instruction could possibly modify the value
-/// being loaded or protect against the load from happening
-/// it is considered a hoist barrier.
-///
-bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(
- const Instruction &Start, const Instruction &End, LoadInst *LI,
- bool SafeToLoadUnconditionally) {
- if (!SafeToLoadUnconditionally)
- for (const Instruction &Inst :
- make_range(Start.getIterator(), End.getIterator()))
- if (!isGuaranteedToTransferExecutionToSuccessor(&Inst))
- return true;
- MemoryLocation Loc = MemoryLocation::get(LI);
- return AA->canInstructionRangeModRef(Start, End, Loc, MRI_Mod);
-}
-
-///
-/// \brief Decide if a load can be hoisted
-///
-/// When there is a load in \p BB to the same address as \p LI
-/// and it can be hoisted from \p BB, return that load.
-/// Otherwise return Null.
-///
-LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB1,
- LoadInst *Load0) {
- BasicBlock *BB0 = Load0->getParent();
- BasicBlock *Head = BB0->getSinglePredecessor();
- bool SafeToLoadUnconditionally = isSafeToLoadUnconditionally(
- Load0->getPointerOperand(), Load0->getAlignment(),
- Load0->getModule()->getDataLayout(),
- /*ScanFrom=*/Head->getTerminator());
- for (BasicBlock::iterator BBI = BB1->begin(), BBE = BB1->end(); BBI != BBE;
- ++BBI) {
- Instruction *Inst = &*BBI;
-
- // Only merge and hoist loads when their result in used only in BB
- auto *Load1 = dyn_cast<LoadInst>(Inst);
- if (!Load1 || Inst->isUsedOutsideOfBlock(BB1))
- continue;
-
- MemoryLocation Loc0 = MemoryLocation::get(Load0);
- MemoryLocation Loc1 = MemoryLocation::get(Load1);
- if (Load0->isSameOperationAs(Load1) && AA->isMustAlias(Loc0, Loc1) &&
- !isLoadHoistBarrierInRange(BB1->front(), *Load1, Load1,
- SafeToLoadUnconditionally) &&
- !isLoadHoistBarrierInRange(BB0->front(), *Load0, Load0,
- SafeToLoadUnconditionally)) {
- return Load1;
- }
- }
- return nullptr;
-}
-
-///
-/// \brief Merge two equivalent instructions \p HoistCand and \p ElseInst into
-/// \p BB
-///
-/// BB is the head of a diamond
-///
-void MergedLoadStoreMotion::hoistInstruction(BasicBlock *BB,
- Instruction *HoistCand,
- Instruction *ElseInst) {
- DEBUG(dbgs() << " Hoist Instruction into BB \n"; BB->dump();
- dbgs() << "Instruction Left\n"; HoistCand->dump(); dbgs() << "\n";
- dbgs() << "Instruction Right\n"; ElseInst->dump(); dbgs() << "\n");
- // Hoist the instruction.
- assert(HoistCand->getParent() != BB);
-
- // Intersect optional metadata.
- HoistCand->andIRFlags(ElseInst);
- HoistCand->dropUnknownNonDebugMetadata();
-
- // Prepend point for instruction insert
- Instruction *HoistPt = BB->getTerminator();
-
- // Merged instruction
- Instruction *HoistedInst = HoistCand->clone();
-
- // Hoist instruction.
- HoistedInst->insertBefore(HoistPt);
-
- HoistCand->replaceAllUsesWith(HoistedInst);
- removeInstruction(HoistCand);
- // Replace the else block instruction.
- ElseInst->replaceAllUsesWith(HoistedInst);
- removeInstruction(ElseInst);
-}
-
-///
-/// \brief Return true if no operand of \p I is defined in I's parent block
-///
-bool MergedLoadStoreMotion::isSafeToHoist(Instruction *I) const {
- BasicBlock *Parent = I->getParent();
- for (Use &U : I->operands())
- if (auto *Instr = dyn_cast<Instruction>(&U))
- if (Instr->getParent() == Parent)
- return false;
- return true;
-}
-
-///
-/// \brief Merge two equivalent loads and GEPs and hoist into diamond head
-///
-bool MergedLoadStoreMotion::hoistLoad(BasicBlock *BB, LoadInst *L0,
- LoadInst *L1) {
- // Only one definition?
- auto *A0 = dyn_cast<Instruction>(L0->getPointerOperand());
- auto *A1 = dyn_cast<Instruction>(L1->getPointerOperand());
- if (A0 && A1 && A0->isIdenticalTo(A1) && isSafeToHoist(A0) &&
- A0->hasOneUse() && (A0->getParent() == L0->getParent()) &&
- A1->hasOneUse() && (A1->getParent() == L1->getParent()) &&
- isa<GetElementPtrInst>(A0)) {
- DEBUG(dbgs() << "Hoist Instruction into BB \n"; BB->dump();
- dbgs() << "Instruction Left\n"; L0->dump(); dbgs() << "\n";
- dbgs() << "Instruction Right\n"; L1->dump(); dbgs() << "\n");
- hoistInstruction(BB, A0, A1);
- hoistInstruction(BB, L0, L1);
- return true;
- }
- return false;
-}
-
-///
-/// \brief Try to hoist two loads to same address into diamond header
-///
-/// Starting from a diamond head block, iterate over the instructions in one
-/// successor block and try to match a load in the second successor.
-///
-bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) {
- bool MergedLoads = false;
- assert(isDiamondHead(BB));
- BranchInst *BI = cast<BranchInst>(BB->getTerminator());
- BasicBlock *Succ0 = BI->getSuccessor(0);
- BasicBlock *Succ1 = BI->getSuccessor(1);
- // #Instructions in Succ1 for Compile Time Control
- int Size1 = Succ1->size();
- int NLoads = 0;
- for (BasicBlock::iterator BBI = Succ0->begin(), BBE = Succ0->end();
- BBI != BBE;) {
- Instruction *I = &*BBI;
- ++BBI;
-
- // Don't move non-simple (atomic, volatile) loads.
- auto *L0 = dyn_cast<LoadInst>(I);
- if (!L0 || !L0->isSimple() || L0->isUsedOutsideOfBlock(Succ0))
- continue;
-
- ++NLoads;
- if (NLoads * Size1 >= MagicCompileTimeControl)
- break;
- if (LoadInst *L1 = canHoistFromBlock(Succ1, L0)) {
- bool Res = hoistLoad(BB, L0, L1);
- MergedLoads |= Res;
- // Don't attempt to hoist above loads that had not been hoisted.
- if (!Res)
- break;
- }
- }
- return MergedLoads;
-}
///
/// \brief True when instruction is a sink barrier for a store
@@ -534,7 +362,6 @@ bool MergedLoadStoreMotion::run(Function &F, MemoryDependenceResults *MD,
// Hoist equivalent loads and sink stores
// outside diamonds when possible
if (isDiamondHead(BB)) {
- Changed |= mergeLoads(BB);
Changed |= mergeStores(getDiamondTail(BB));
}
}
@@ -596,8 +423,8 @@ MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) {
if (!Impl.run(F, MD, AA))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
PA.preserve<MemoryDependenceAnalysis>();
return PA;
diff --git a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
index 0a3bf7b4c31b..c5bf2f28d185 100644
--- a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -156,20 +156,12 @@ PreservedAnalyses NaryReassociatePass::run(Function &F,
auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
- bool Changed = runImpl(F, AC, DT, SE, TLI, TTI);
-
- // FIXME: We need to invalidate this to avoid PR28400. Is there a better
- // solution?
- AM.invalidate<ScalarEvolutionAnalysis>(F);
-
- if (!Changed)
+ if (!runImpl(F, AC, DT, SE, TLI, TTI))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<ScalarEvolutionAnalysis>();
- PA.preserve<TargetLibraryAnalysis>();
return PA;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp b/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 57e6e3ddad94..3d8ce888867e 100644
--- a/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -17,6 +17,27 @@
/// "A Sparse Algorithm for Predicated Global Value Numbering" from
/// Karthik Gargi.
///
+/// A brief overview of the algorithm: The algorithm is essentially the same as
+/// the standard RPO value numbering algorithm (a good reference is the paper
+/// "SCC based value numbering" by L. Taylor Simpson) with one major difference:
+/// The RPO algorithm proceeds, on every iteration, to process every reachable
+/// block and every instruction in that block. This is because the standard RPO
+/// algorithm does not track what things have the same value number, it only
+/// tracks what the value number of a given operation is (the mapping is
+/// operation -> value number). Thus, when a value number of an operation
+/// changes, it must reprocess everything to ensure all uses of a value number
+/// get updated properly. In contrast, the sparse algorithm we use *also*
+/// tracks what operations have a given value number (IE it also tracks the
+/// reverse mapping from value number -> operations with that value number), so
+/// that it only needs to reprocess the instructions that are affected when
+/// something's value number changes. The rest of the algorithm is devoted to
+/// performing symbolic evaluation, forward propagation, and simplification of
+/// operations based on the value numbers deduced so far.
+///
+/// We also do not perform elimination by using any published algorithm. All
+/// published algorithms are O(Instructions). Instead, we use a technique that
+/// is O(number of operations with the same value number), enabling us to skip
+/// trying to eliminate things that have unique value numbers.
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/NewGVN.h"
@@ -40,13 +61,10 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/Analysis/PHITransAddr.h"
+#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalVariable.h"
@@ -55,24 +73,25 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/PredIteratorCache.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVNExpression.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/MemorySSA.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/PredicateInfo.h"
+#include "llvm/Transforms/Utils/VNCoercion.h"
+#include <numeric>
#include <unordered_map>
#include <utility>
#include <vector>
using namespace llvm;
using namespace PatternMatch;
using namespace llvm::GVNExpression;
-
+using namespace llvm::VNCoercion;
#define DEBUG_TYPE "newgvn"
STATISTIC(NumGVNInstrDeleted, "Number of instructions deleted");
@@ -87,6 +106,15 @@ STATISTIC(NumGVNAvoidedSortedLeaderChanges,
"Number of avoided sorted leader changes");
STATISTIC(NumGVNNotMostDominatingLeader,
"Number of times a member dominated it's new classes' leader");
+STATISTIC(NumGVNDeadStores, "Number of redundant/dead stores eliminated");
+DEBUG_COUNTER(VNCounter, "newgvn-vn",
+ "Controls which instructions are value numbered")
+
+// Currently store defining access refinement is too slow due to basicaa being
+// egregiously slow. This flag lets us keep it working while we work on this
+// issue.
+static cl::opt<bool> EnableStoreRefinement("enable-store-refinement",
+ cl::init(false), cl::Hidden);
//===----------------------------------------------------------------------===//
// GVN Pass
@@ -105,6 +133,77 @@ PHIExpression::~PHIExpression() = default;
}
}
+// Tarjan's SCC finding algorithm with Nuutila's improvements
+// SCCIterator is actually fairly complex for the simple thing we want.
+// It also wants to hand us SCC's that are unrelated to the phi node we ask
+// about, and have us process them there or risk redoing work.
+// Graph traits over a filter iterator also doesn't work that well here.
+// This SCC finder is specialized to walk use-def chains, and only follows
+// instructions, not generic values (arguments, etc.).
+struct TarjanSCC {
+
+ TarjanSCC() : Components(1) {}
+
+ void Start(const Instruction *Start) {
+ if (Root.lookup(Start) == 0)
+ FindSCC(Start);
+ }
+
+ const SmallPtrSetImpl<const Value *> &getComponentFor(const Value *V) const {
+ unsigned ComponentID = ValueToComponent.lookup(V);
+
+ assert(ComponentID > 0 &&
+ "Asking for a component for a value we never processed");
+ return Components[ComponentID];
+ }
+
+private:
+ void FindSCC(const Instruction *I) {
+ Root[I] = ++DFSNum;
+ // Store the DFS Number we had before it possibly gets incremented.
+ unsigned int OurDFS = DFSNum;
+ for (auto &Op : I->operands()) {
+ if (auto *InstOp = dyn_cast<Instruction>(Op)) {
+ if (Root.lookup(Op) == 0)
+ FindSCC(InstOp);
+ if (!InComponent.count(Op))
+ Root[I] = std::min(Root.lookup(I), Root.lookup(Op));
+ }
+ }
+    // See if we really were the root of a component, by seeing if we still
+    // have our DFSNumber. If we do, we are the root of the component, and we
+    // have completed a component. If we do not, we are not the root of a
+    // component, and belong on the component stack.
+ if (Root.lookup(I) == OurDFS) {
+ unsigned ComponentID = Components.size();
+ Components.resize(Components.size() + 1);
+ auto &Component = Components.back();
+ Component.insert(I);
+ DEBUG(dbgs() << "Component root is " << *I << "\n");
+ InComponent.insert(I);
+ ValueToComponent[I] = ComponentID;
+ // Pop a component off the stack and label it.
+ while (!Stack.empty() && Root.lookup(Stack.back()) >= OurDFS) {
+ auto *Member = Stack.back();
+ DEBUG(dbgs() << "Component member is " << *Member << "\n");
+ Component.insert(Member);
+ InComponent.insert(Member);
+ ValueToComponent[Member] = ComponentID;
+ Stack.pop_back();
+ }
+ } else {
+ // Part of a component, push to stack
+ Stack.push_back(I);
+ }
+ }
+ unsigned int DFSNum = 1;
+ SmallPtrSet<const Value *, 8> InComponent;
+ DenseMap<const Value *, unsigned int> Root;
+ SmallVector<const Value *, 8> Stack;
+  // Store the components as a vector of ptr sets, because we need the topo
+  // order of SCC's, but not individual member order.
+ SmallVector<SmallPtrSet<const Value *, 8>, 8> Components;
+ DenseMap<const Value *, unsigned> ValueToComponent;
+};
// Congruence classes represent the set of expressions/instructions
// that are all the same *during some scope in the function*.
// That is, because of the way we perform equality propagation, and
@@ -115,43 +214,152 @@ PHIExpression::~PHIExpression() = default;
// For any Value in the Member set, it is valid to replace any dominated member
// with that Value.
//
-// Every congruence class has a leader, and the leader is used to
-// symbolize instructions in a canonical way (IE every operand of an
-// instruction that is a member of the same congruence class will
-// always be replaced with leader during symbolization).
-// To simplify symbolization, we keep the leader as a constant if class can be
-// proved to be a constant value.
-// Otherwise, the leader is a randomly chosen member of the value set, it does
-// not matter which one is chosen.
-// Each congruence class also has a defining expression,
-// though the expression may be null. If it exists, it can be used for forward
-// propagation and reassociation of values.
-//
-struct CongruenceClass {
- using MemberSet = SmallPtrSet<Value *, 4>;
+// Every congruence class has a leader, and the leader is used to symbolize
+// instructions in a canonical way (IE every operand of an instruction that is a
+// member of the same congruence class will always be replaced with leader
+// during symbolization). To simplify symbolization, we keep the leader as a
+// constant if class can be proved to be a constant value. Otherwise, the
+// leader is the member of the value set with the smallest DFS number. Each
+// congruence class also has a defining expression, though the expression may be
+// null. If it exists, it can be used for forward propagation and reassociation
+// of values.
+
+// For memory, we also track a representative MemoryAccess, and a set of memory
+// members for MemoryPhis (which have no real instructions). Note that for
+// memory, it seems tempting to try to split the memory members into a
+// MemoryCongruenceClass or something. Unfortunately, this does not work
+// easily. The value numbering of a given memory expression depends on the
+// leader of the memory congruence class, and the leader of memory congruence
+// class depends on the value numbering of a given memory expression. This
+// leads to wasted propagation, and in some cases, missed optimization. For
+// example: If we had value numbered two stores together before, but now do not,
+// we move them to a new value congruence class. This in turn will move at one
+// of the memorydefs to a new memory congruence class. Which in turn, affects
+// the value numbering of the stores we just value numbered (because the memory
+// congruence class is part of the value number). So while theoretically
+// possible to split them up, it turns out to be *incredibly* complicated to get
+// it to work right, because of the interdependency. While structurally
+// slightly messier, it is algorithmically much simpler and faster to do what we
+// do here, and track them both at once in the same class.
+// Note: The default iterators for this class iterate over values
+class CongruenceClass {
+public:
+ using MemberType = Value;
+ using MemberSet = SmallPtrSet<MemberType *, 4>;
+ using MemoryMemberType = MemoryPhi;
+ using MemoryMemberSet = SmallPtrSet<const MemoryMemberType *, 2>;
+
+ explicit CongruenceClass(unsigned ID) : ID(ID) {}
+ CongruenceClass(unsigned ID, Value *Leader, const Expression *E)
+ : ID(ID), RepLeader(Leader), DefiningExpr(E) {}
+ unsigned getID() const { return ID; }
+ // True if this class has no members left. This is mainly used for assertion
+ // purposes, and for skipping empty classes.
+ bool isDead() const {
+ // If it's both dead from a value perspective, and dead from a memory
+ // perspective, it's really dead.
+ return empty() && memory_empty();
+ }
+ // Leader functions
+ Value *getLeader() const { return RepLeader; }
+ void setLeader(Value *Leader) { RepLeader = Leader; }
+ const std::pair<Value *, unsigned int> &getNextLeader() const {
+ return NextLeader;
+ }
+ void resetNextLeader() { NextLeader = {nullptr, ~0}; }
+
+ void addPossibleNextLeader(std::pair<Value *, unsigned int> LeaderPair) {
+ if (LeaderPair.second < NextLeader.second)
+ NextLeader = LeaderPair;
+ }
+
+ Value *getStoredValue() const { return RepStoredValue; }
+ void setStoredValue(Value *Leader) { RepStoredValue = Leader; }
+ const MemoryAccess *getMemoryLeader() const { return RepMemoryAccess; }
+ void setMemoryLeader(const MemoryAccess *Leader) { RepMemoryAccess = Leader; }
+
+ // Forward propagation info
+ const Expression *getDefiningExpr() const { return DefiningExpr; }
+ void setDefiningExpr(const Expression *E) { DefiningExpr = E; }
+
+ // Value member set
+ bool empty() const { return Members.empty(); }
+ unsigned size() const { return Members.size(); }
+ MemberSet::const_iterator begin() const { return Members.begin(); }
+ MemberSet::const_iterator end() const { return Members.end(); }
+ void insert(MemberType *M) { Members.insert(M); }
+ void erase(MemberType *M) { Members.erase(M); }
+ void swap(MemberSet &Other) { Members.swap(Other); }
+
+ // Memory member set
+ bool memory_empty() const { return MemoryMembers.empty(); }
+ unsigned memory_size() const { return MemoryMembers.size(); }
+ MemoryMemberSet::const_iterator memory_begin() const {
+ return MemoryMembers.begin();
+ }
+ MemoryMemberSet::const_iterator memory_end() const {
+ return MemoryMembers.end();
+ }
+ iterator_range<MemoryMemberSet::const_iterator> memory() const {
+ return make_range(memory_begin(), memory_end());
+ }
+ void memory_insert(const MemoryMemberType *M) { MemoryMembers.insert(M); }
+ void memory_erase(const MemoryMemberType *M) { MemoryMembers.erase(M); }
+
+ // Store count
+ unsigned getStoreCount() const { return StoreCount; }
+ void incStoreCount() { ++StoreCount; }
+ void decStoreCount() {
+ assert(StoreCount != 0 && "Store count went negative");
+ --StoreCount;
+ }
+
+  // Return true if two congruence classes are equivalent to each other. This
+  // means that every field but the ID number and the dead field are
+  // equivalent.
+ bool isEquivalentTo(const CongruenceClass *Other) const {
+ if (!Other)
+ return false;
+ if (this == Other)
+ return true;
+
+ if (std::tie(StoreCount, RepLeader, RepStoredValue, RepMemoryAccess) !=
+ std::tie(Other->StoreCount, Other->RepLeader, Other->RepStoredValue,
+ Other->RepMemoryAccess))
+ return false;
+ if (DefiningExpr != Other->DefiningExpr)
+ if (!DefiningExpr || !Other->DefiningExpr ||
+ *DefiningExpr != *Other->DefiningExpr)
+ return false;
+ // We need some ordered set
+ std::set<Value *> AMembers(Members.begin(), Members.end());
+ std::set<Value *> BMembers(Members.begin(), Members.end());
+ return AMembers == BMembers;
+ }
+
+private:
unsigned ID;
// Representative leader.
Value *RepLeader = nullptr;
+ // The most dominating leader after our current leader, because the member set
+ // is not sorted and is expensive to keep sorted all the time.
+ std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U};
+ // If this is represented by a store, the value of the store.
+ Value *RepStoredValue = nullptr;
+ // If this class contains MemoryDefs or MemoryPhis, this is the leading memory
+ // access.
+ const MemoryAccess *RepMemoryAccess = nullptr;
// Defining Expression.
const Expression *DefiningExpr = nullptr;
// Actual members of this class.
MemberSet Members;
-
- // True if this class has no members left. This is mainly used for assertion
- // purposes, and for skipping empty classes.
- bool Dead = false;
-
+ // This is the set of MemoryPhis that exist in the class. MemoryDefs and
+ // MemoryUses have real instructions representing them, so we only need to
+ // track MemoryPhis here.
+ MemoryMemberSet MemoryMembers;
// Number of stores in this congruence class.
// This is used so we can detect store equivalence changes properly.
int StoreCount = 0;
-
- // The most dominating leader after our current leader, because the member set
- // is not sorted and is expensive to keep sorted all the time.
- std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U};
-
- explicit CongruenceClass(unsigned ID) : ID(ID) {}
- CongruenceClass(unsigned ID, Value *Leader, const Expression *E)
- : ID(ID), RepLeader(Leader), DefiningExpr(E) {}
};
namespace llvm {
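
A hypothetical source-level analogue, not the pass itself, of what the leader machinery above enables during elimination: every dominated member of a congruence class can be rewritten in terms of one representative, and if that representative is a constant the whole class folds away.

#include <cassert>

static int beforeGVN(int X, int Y) {
  int A = X + Y;
  int B = Y + X;      // congruent to A: same class, different member
  return A * B;
}

static int afterGVN(int X, int Y) {
  int Leader = X + Y; // one leader stands in for every member of the class
  return Leader * Leader;
}

int main() {
  assert(beforeGVN(3, 4) == afterGVN(3, 4));
  return 0;
}
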
@@ -180,19 +388,34 @@ template <> struct DenseMapInfo<const Expression *> {
};
} // end namespace llvm
-class NewGVN : public FunctionPass {
+namespace {
+class NewGVN {
+ Function &F;
DominatorTree *DT;
- const DataLayout *DL;
- const TargetLibraryInfo *TLI;
AssumptionCache *AC;
+ const TargetLibraryInfo *TLI;
AliasAnalysis *AA;
MemorySSA *MSSA;
MemorySSAWalker *MSSAWalker;
+ const DataLayout &DL;
+ std::unique_ptr<PredicateInfo> PredInfo;
BumpPtrAllocator ExpressionAllocator;
ArrayRecycler<Value *> ArgRecycler;
+ TarjanSCC SCCFinder;
+
+ // Number of function arguments, used by ranking
+ unsigned int NumFuncArgs;
+
+ // RPOOrdering of basic blocks
+ DenseMap<const DomTreeNode *, unsigned> RPOOrdering;
// Congruence class info.
- CongruenceClass *InitialClass;
+
+ // This class is called INITIAL in the paper. It is the class everything
+  // starts out in, and represents any value. Being an optimistic analysis,
+ // anything in the TOP class has the value TOP, which is indeterminate and
+ // equivalent to everything.
+ CongruenceClass *TOPClass;
std::vector<CongruenceClass *> CongruenceClasses;
unsigned NextCongruenceNum;
@@ -200,13 +423,38 @@ class NewGVN : public FunctionPass {
DenseMap<Value *, CongruenceClass *> ValueToClass;
DenseMap<Value *, const Expression *> ValueToExpression;
+ // Mapping from predicate info we used to the instructions we used it with.
+ // In order to correctly ensure propagation, we must keep track of what
+ // comparisons we used, so that when the values of the comparisons change, we
+ // propagate the information to the places we used the comparison.
+ DenseMap<const Value *, SmallPtrSet<Instruction *, 2>> PredicateToUsers;
+ // Mapping from MemoryAccess we used to the MemoryAccess we used it with. Has
+ // the same reasoning as PredicateToUsers. When we skip MemoryAccesses for
+ // stores, we no longer can rely solely on the def-use chains of MemorySSA.
+ DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>> MemoryToUsers;
+
// A table storing which memorydefs/phis represent a memory state provably
// equivalent to another memory state.
// We could use the congruence class machinery, but the MemoryAccess's are
// abstract memory states, so they can only ever be equivalent to each other,
// and not to constants, etc.
- DenseMap<const MemoryAccess *, MemoryAccess *> MemoryAccessEquiv;
-
+ DenseMap<const MemoryAccess *, CongruenceClass *> MemoryAccessToClass;
+
+ // We could, if we wanted, build MemoryPhiExpressions and
+ // MemoryVariableExpressions, etc, and value number them the same way we value
+ // number phi expressions. For the moment, this seems like overkill. They
+ // can only exist in one of three states: they can be TOP (equal to
+ // everything), Equivalent to something else, or unique. Because we do not
+ // create expressions for them, we need to simulate leader change not just
+ // when they change class, but when they change state. Note: We can do the
+  // same thing for phis, and avoid having phi expressions if we wanted. We
+  // should eventually unify in one direction or the other, so this is a little
+  // bit of an experiment in which approach turns out easier to maintain.
+ enum MemoryPhiState { MPS_Invalid, MPS_TOP, MPS_Equivalent, MPS_Unique };
+ DenseMap<const MemoryPhi *, MemoryPhiState> MemoryPhiState;
+
+ enum PhiCycleState { PCS_Unknown, PCS_CycleFree, PCS_Cycle };
+ DenseMap<const PHINode *, PhiCycleState> PhiCycleState;
// Expression to class mapping.
using ExpressionClassMap = DenseMap<const Expression *, CongruenceClass *>;
ExpressionClassMap ExpressionToClass;
@@ -231,8 +479,6 @@ class NewGVN : public FunctionPass {
BitVector TouchedInstructions;
DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange;
- DenseMap<const DomTreeNode *, std::pair<unsigned, unsigned>>
- DominatedInstRange;
#ifndef NDEBUG
// Debugging for how many times each block and instruction got processed.
@@ -240,56 +486,42 @@ class NewGVN : public FunctionPass {
#endif
// DFS info.
- DenseMap<const BasicBlock *, std::pair<int, int>> DFSDomMap;
+ // This contains a mapping from Instructions to DFS numbers.
+ // The numbering starts at 1. An instruction with DFS number zero
+ // means that the instruction is dead.
DenseMap<const Value *, unsigned> InstrDFS;
+
+ // This contains the mapping DFS numbers to instructions.
SmallVector<Value *, 32> DFSToInstr;
// Deletion info.
SmallPtrSet<Instruction *, 8> InstructionsToErase;
public:
- static char ID; // Pass identification, replacement for typeid.
- NewGVN() : FunctionPass(ID) {
- initializeNewGVNPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
- bool runGVN(Function &F, DominatorTree *DT, AssumptionCache *AC,
- TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA);
+ NewGVN(Function &F, DominatorTree *DT, AssumptionCache *AC,
+ TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA,
+ const DataLayout &DL)
+ : F(F), DT(DT), AC(AC), TLI(TLI), AA(AA), MSSA(MSSA), DL(DL),
+ PredInfo(make_unique<PredicateInfo>(F, *DT, *AC)) {}
+ bool runGVN();
private:
- // This transformation requires dominator postdominator info.
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
-
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-
// Expression handling.
- const Expression *createExpression(Instruction *, const BasicBlock *);
- const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *,
- const BasicBlock *);
- PHIExpression *createPHIExpression(Instruction *);
+ const Expression *createExpression(Instruction *);
+ const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *);
+ PHIExpression *createPHIExpression(Instruction *, bool &HasBackedge,
+ bool &AllConstant);
const VariableExpression *createVariableExpression(Value *);
const ConstantExpression *createConstantExpression(Constant *);
- const Expression *createVariableOrConstant(Value *V, const BasicBlock *B);
+ const Expression *createVariableOrConstant(Value *V);
const UnknownExpression *createUnknownExpression(Instruction *);
- const StoreExpression *createStoreExpression(StoreInst *, MemoryAccess *,
- const BasicBlock *);
+ const StoreExpression *createStoreExpression(StoreInst *,
+ const MemoryAccess *);
LoadExpression *createLoadExpression(Type *, Value *, LoadInst *,
- MemoryAccess *, const BasicBlock *);
-
- const CallExpression *createCallExpression(CallInst *, MemoryAccess *,
- const BasicBlock *);
- const AggregateValueExpression *
- createAggregateValueExpression(Instruction *, const BasicBlock *);
- bool setBasicExpressionInfo(Instruction *, BasicExpression *,
- const BasicBlock *);
+ const MemoryAccess *);
+ const CallExpression *createCallExpression(CallInst *, const MemoryAccess *);
+ const AggregateValueExpression *createAggregateValueExpression(Instruction *);
+ bool setBasicExpressionInfo(Instruction *, BasicExpression *);
// Congruence class handling.
CongruenceClass *createCongruenceClass(Value *Leader, const Expression *E) {
@@ -298,9 +530,21 @@ private:
return result;
}
+ CongruenceClass *createMemoryClass(MemoryAccess *MA) {
+ auto *CC = createCongruenceClass(nullptr, nullptr);
+ CC->setMemoryLeader(MA);
+ return CC;
+ }
+ CongruenceClass *ensureLeaderOfMemoryClass(MemoryAccess *MA) {
+ auto *CC = getMemoryClass(MA);
+ if (CC->getMemoryLeader() != MA)
+ CC = createMemoryClass(MA);
+ return CC;
+ }
+
CongruenceClass *createSingletonCongruenceClass(Value *Member) {
CongruenceClass *CClass = createCongruenceClass(Member, nullptr);
- CClass->Members.insert(Member);
+ CClass->insert(Member);
ValueToClass[Member] = CClass;
return CClass;
}
@@ -313,37 +557,49 @@ private:
// Symbolic evaluation.
const Expression *checkSimplificationResults(Expression *, Instruction *,
Value *);
- const Expression *performSymbolicEvaluation(Value *, const BasicBlock *);
- const Expression *performSymbolicLoadEvaluation(Instruction *,
- const BasicBlock *);
- const Expression *performSymbolicStoreEvaluation(Instruction *,
- const BasicBlock *);
- const Expression *performSymbolicCallEvaluation(Instruction *,
- const BasicBlock *);
- const Expression *performSymbolicPHIEvaluation(Instruction *,
- const BasicBlock *);
- bool setMemoryAccessEquivTo(MemoryAccess *From, MemoryAccess *To);
- const Expression *performSymbolicAggrValueEvaluation(Instruction *,
- const BasicBlock *);
+ const Expression *performSymbolicEvaluation(Value *);
+ const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *,
+ Instruction *, MemoryAccess *);
+ const Expression *performSymbolicLoadEvaluation(Instruction *);
+ const Expression *performSymbolicStoreEvaluation(Instruction *);
+ const Expression *performSymbolicCallEvaluation(Instruction *);
+ const Expression *performSymbolicPHIEvaluation(Instruction *);
+ const Expression *performSymbolicAggrValueEvaluation(Instruction *);
+ const Expression *performSymbolicCmpEvaluation(Instruction *);
+ const Expression *performSymbolicPredicateInfoEvaluation(Instruction *);
// Congruence finding.
- // Templated to allow them to work both on BB's and BB-edges.
- template <class T>
- Value *lookupOperandLeader(Value *, const User *, const T &) const;
+ bool someEquivalentDominates(const Instruction *, const Instruction *) const;
+ Value *lookupOperandLeader(Value *) const;
void performCongruenceFinding(Instruction *, const Expression *);
- void moveValueToNewCongruenceClass(Instruction *, CongruenceClass *,
- CongruenceClass *);
+ void moveValueToNewCongruenceClass(Instruction *, const Expression *,
+ CongruenceClass *, CongruenceClass *);
+ void moveMemoryToNewCongruenceClass(Instruction *, MemoryAccess *,
+ CongruenceClass *, CongruenceClass *);
+ Value *getNextValueLeader(CongruenceClass *) const;
+ const MemoryAccess *getNextMemoryLeader(CongruenceClass *) const;
+ bool setMemoryClass(const MemoryAccess *From, CongruenceClass *To);
+ CongruenceClass *getMemoryClass(const MemoryAccess *MA) const;
+ const MemoryAccess *lookupMemoryLeader(const MemoryAccess *) const;
+ bool isMemoryAccessTop(const MemoryAccess *) const;
+
+ // Ranking
+ unsigned int getRank(const Value *) const;
+ bool shouldSwapOperands(const Value *, const Value *) const;
+
// Reachability handling.
void updateReachableEdge(BasicBlock *, BasicBlock *);
void processOutgoingEdges(TerminatorInst *, BasicBlock *);
- bool isOnlyReachableViaThisEdge(const BasicBlockEdge &) const;
- Value *findConditionEquivalence(Value *, BasicBlock *) const;
- MemoryAccess *lookupMemoryAccessEquiv(MemoryAccess *) const;
+ Value *findConditionEquivalence(Value *) const;
// Elimination.
struct ValueDFS;
- void convertDenseToDFSOrdered(CongruenceClass::MemberSet &,
- SmallVectorImpl<ValueDFS> &);
+ void convertClassToDFSOrdered(const CongruenceClass &,
+ SmallVectorImpl<ValueDFS> &,
+ DenseMap<const Value *, unsigned int> &,
+ SmallPtrSetImpl<Instruction *> &) const;
+ void convertClassToLoadsAndStores(const CongruenceClass &,
+ SmallVectorImpl<ValueDFS> &) const;
bool eliminateInstructions(Function &);
void replaceInstruction(Instruction *, Value *);
@@ -355,35 +611,58 @@ private:
// Various instruction touch utilities
void markUsersTouched(Value *);
- void markMemoryUsersTouched(MemoryAccess *);
- void markLeaderChangeTouched(CongruenceClass *CC);
+ void markMemoryUsersTouched(const MemoryAccess *);
+ void markMemoryDefTouched(const MemoryAccess *);
+ void markPredicateUsersTouched(Instruction *);
+ void markValueLeaderChangeTouched(CongruenceClass *CC);
+ void markMemoryLeaderChangeTouched(CongruenceClass *CC);
+ void addPredicateUsers(const PredicateBase *, Instruction *);
+ void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U);
+
+ // Main loop of value numbering
+ void iterateTouchedInstructions();
// Utilities.
void cleanupTables();
std::pair<unsigned, unsigned> assignDFSNumbers(BasicBlock *, unsigned);
void updateProcessedCount(Value *V);
void verifyMemoryCongruency() const;
+ void verifyIterationSettled(Function &F);
bool singleReachablePHIPath(const MemoryAccess *, const MemoryAccess *) const;
-};
-
-char NewGVN::ID = 0;
+ BasicBlock *getBlockForValue(Value *V) const;
+ void deleteExpression(const Expression *E);
+ unsigned InstrToDFSNum(const Value *V) const {
+ assert(isa<Instruction>(V) && "This should not be used for MemoryAccesses");
+ return InstrDFS.lookup(V);
+ }
-// createGVNPass - The public interface to this file.
-FunctionPass *llvm::createNewGVNPass() { return new NewGVN(); }
+ unsigned InstrToDFSNum(const MemoryAccess *MA) const {
+ return MemoryToDFSNum(MA);
+ }
+ Value *InstrFromDFSNum(unsigned DFSNum) { return DFSToInstr[DFSNum]; }
+ // Given a MemoryAccess, return the relevant instruction DFS number. Note:
+ // This deliberately takes a Value so it can be used with Uses, which will
+ // auto-convert to Values but not to MemoryAccesses.
+ unsigned MemoryToDFSNum(const Value *MA) const {
+ assert(isa<MemoryAccess>(MA) &&
+ "This should not be used with instructions");
+ return isa<MemoryUseOrDef>(MA)
+ ? InstrToDFSNum(cast<MemoryUseOrDef>(MA)->getMemoryInst())
+ : InstrDFS.lookup(MA);
+ }
+ bool isCycleFree(const PHINode *PN);
+ template <class T, class Range> T *getMinDFSOfRange(const Range &) const;
+ // Debug counter info. When verifying, we have to reset the value numbering
+ // debug counter to the same state it started in to get the same results.
+ std::pair<int, int> StartingVNCounter;
+};
+} // end anonymous namespace
template <typename T>
static bool equalsLoadStoreHelper(const T &LHS, const Expression &RHS) {
- if ((!isa<LoadExpression>(RHS) && !isa<StoreExpression>(RHS)) ||
- !LHS.BasicExpression::equals(RHS)) {
+ if (!isa<LoadExpression>(RHS) && !isa<StoreExpression>(RHS))
return false;
- } else if (const auto *L = dyn_cast<LoadExpression>(&RHS)) {
- if (LHS.getDefiningAccess() != L->getDefiningAccess())
- return false;
- } else if (const auto *S = dyn_cast<StoreExpression>(&RHS)) {
- if (LHS.getDefiningAccess() != S->getDefiningAccess())
- return false;
- }
- return true;
+ return LHS.MemoryExpression::equals(RHS);
}
bool LoadExpression::equals(const Expression &Other) const {
@@ -391,7 +670,13 @@ bool LoadExpression::equals(const Expression &Other) const {
}
bool StoreExpression::equals(const Expression &Other) const {
- return equalsLoadStoreHelper(*this, Other);
+ if (!equalsLoadStoreHelper(*this, Other))
+ return false;
+ // Make sure that store vs store includes the value operand.
+ if (const auto *S = dyn_cast<StoreExpression>(&Other))
+ if (getStoredValue() != S->getStoredValue())
+ return false;
+ return true;
}
#ifndef NDEBUG
@@ -400,16 +685,28 @@ static std::string getBlockName(const BasicBlock *B) {
}
#endif
-INITIALIZE_PASS_BEGIN(NewGVN, "newgvn", "Global Value Numbering", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_END(NewGVN, "newgvn", "Global Value Numbering", false, false)
+// Get the basic block from an instruction/memory value.
+BasicBlock *NewGVN::getBlockForValue(Value *V) const {
+ if (auto *I = dyn_cast<Instruction>(V))
+ return I->getParent();
+ else if (auto *MP = dyn_cast<MemoryPhi>(V))
+ return MP->getBlock();
+ llvm_unreachable("Should have been able to figure out a block for our value");
+ return nullptr;
+}
-PHIExpression *NewGVN::createPHIExpression(Instruction *I) {
+// Delete a definitely dead expression, so it can be reused by the expression
+// allocator. Some of these are not in creation functions, so we have to accept
+// const versions.
+void NewGVN::deleteExpression(const Expression *E) {
+ assert(isa<BasicExpression>(E));
+ auto *BE = cast<BasicExpression>(E);
+ const_cast<BasicExpression *>(BE)->deallocateOperands(ArgRecycler);
+ ExpressionAllocator.Deallocate(E);
+}
+
+PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge,
+ bool &AllConstant) {
BasicBlock *PHIBlock = I->getParent();
auto *PN = cast<PHINode>(I);
auto *E =
@@ -419,28 +716,32 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I) {
E->setType(I->getType());
E->setOpcode(I->getOpcode());
- auto ReachablePhiArg = [&](const Use &U) {
- return ReachableBlocks.count(PN->getIncomingBlock(U));
- };
+ unsigned PHIRPO = RPOOrdering.lookup(DT->getNode(PHIBlock));
- // Filter out unreachable operands
- auto Filtered = make_filter_range(PN->operands(), ReachablePhiArg);
+ // Filter out unreachable phi operands.
+ auto Filtered = make_filter_range(PN->operands(), [&](const Use &U) {
+ return ReachableEdges.count({PN->getIncomingBlock(U), PHIBlock});
+ });
std::transform(Filtered.begin(), Filtered.end(), op_inserter(E),
[&](const Use &U) -> Value * {
+ auto *BB = PN->getIncomingBlock(U);
+ auto *DTN = DT->getNode(BB);
+ if (RPOOrdering.lookup(DTN) >= PHIRPO)
+ HasBackedge = true;
+ AllConstant &= isa<UndefValue>(U) || isa<Constant>(U);
+
// Don't try to transform self-defined phis.
if (U == PN)
return PN;
- const BasicBlockEdge BBE(PN->getIncomingBlock(U), PHIBlock);
- return lookupOperandLeader(U, I, BBE);
+ return lookupOperandLeader(U);
});
return E;
}
// Set basic expression info (Arguments, type, opcode) for Expression
// E from Instruction I in block B.
-bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E,
- const BasicBlock *B) {
+bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) {
bool AllConstant = true;
if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
E->setType(GEP->getSourceElementType());
@@ -452,7 +753,7 @@ bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E,
// Transform the operand array into an operand leader array, and keep track of
// whether all members are constant.
std::transform(I->op_begin(), I->op_end(), op_inserter(E), [&](Value *O) {
- auto Operand = lookupOperandLeader(O, I, B);
+ auto Operand = lookupOperandLeader(O);
AllConstant &= isa<Constant>(Operand);
return Operand;
});
@@ -461,8 +762,7 @@ bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E,
}
const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
- Value *Arg1, Value *Arg2,
- const BasicBlock *B) {
+ Value *Arg1, Value *Arg2) {
auto *E = new (ExpressionAllocator) BasicExpression(2);
E->setType(T);
@@ -473,13 +773,13 @@ const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
// of their operands get the same value number by sorting the operand value
// numbers. Since all commutative instructions have two operands it is more
// efficient to sort by hand rather than using, say, std::sort.
- if (Arg1 > Arg2)
+ if (shouldSwapOperands(Arg1, Arg2))
std::swap(Arg1, Arg2);
}
- E->op_push_back(lookupOperandLeader(Arg1, nullptr, B));
- E->op_push_back(lookupOperandLeader(Arg2, nullptr, B));
+ E->op_push_back(lookupOperandLeader(Arg1));
+ E->op_push_back(lookupOperandLeader(Arg2));
- Value *V = SimplifyBinOp(Opcode, E->getOperand(0), E->getOperand(1), *DL, TLI,
+ Value *V = SimplifyBinOp(Opcode, E->getOperand(0), E->getOperand(1), DL, TLI,
DT, AC);
if (const Expression *SimplifiedE = checkSimplificationResults(E, nullptr, V))
return SimplifiedE;
@@ -502,40 +802,32 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
NumGVNOpsSimplified++;
assert(isa<BasicExpression>(E) &&
"We should always have had a basic expression here");
-
- cast<BasicExpression>(E)->deallocateOperands(ArgRecycler);
- ExpressionAllocator.Deallocate(E);
+ deleteExpression(E);
return createConstantExpression(C);
} else if (isa<Argument>(V) || isa<GlobalVariable>(V)) {
if (I)
DEBUG(dbgs() << "Simplified " << *I << " to "
<< " variable " << *V << "\n");
- cast<BasicExpression>(E)->deallocateOperands(ArgRecycler);
- ExpressionAllocator.Deallocate(E);
+ deleteExpression(E);
return createVariableExpression(V);
}
CongruenceClass *CC = ValueToClass.lookup(V);
- if (CC && CC->DefiningExpr) {
+ if (CC && CC->getDefiningExpr()) {
if (I)
DEBUG(dbgs() << "Simplified " << *I << " to "
<< " expression " << *V << "\n");
NumGVNOpsSimplified++;
- assert(isa<BasicExpression>(E) &&
- "We should always have had a basic expression here");
- cast<BasicExpression>(E)->deallocateOperands(ArgRecycler);
- ExpressionAllocator.Deallocate(E);
- return CC->DefiningExpr;
+ deleteExpression(E);
+ return CC->getDefiningExpr();
}
return nullptr;
}
-const Expression *NewGVN::createExpression(Instruction *I,
- const BasicBlock *B) {
-
+const Expression *NewGVN::createExpression(Instruction *I) {
auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands());
- bool AllConstant = setBasicExpressionInfo(I, E, B);
+ bool AllConstant = setBasicExpressionInfo(I, E);
if (I->isCommutative()) {
// Ensure that commutative instructions that only differ by a permutation
@@ -543,7 +835,7 @@ const Expression *NewGVN::createExpression(Instruction *I,
// numbers. Since all commutative instructions have two operands it is more
// efficient to sort by hand rather than using, say, std::sort.
assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!");
- if (E->getOperand(0) > E->getOperand(1))
+ if (shouldSwapOperands(E->getOperand(0), E->getOperand(1)))
E->swapOperands(0, 1);
}
@@ -559,48 +851,43 @@ const Expression *NewGVN::createExpression(Instruction *I,
// Sort the operand value numbers so x<y and y>x get the same value
// number.
CmpInst::Predicate Predicate = CI->getPredicate();
- if (E->getOperand(0) > E->getOperand(1)) {
+ if (shouldSwapOperands(E->getOperand(0), E->getOperand(1))) {
E->swapOperands(0, 1);
Predicate = CmpInst::getSwappedPredicate(Predicate);
}
E->setOpcode((CI->getOpcode() << 8) | Predicate);
// TODO: 25% of our time is spent in SimplifyCmpInst with pointer operands
- // TODO: Since we noop bitcasts, we may need to check types before
- // simplifying, so that we don't end up simplifying based on a wrong
- // type assumption. We should clean this up so we can use constants of the
- // wrong type
-
assert(I->getOperand(0)->getType() == I->getOperand(1)->getType() &&
"Wrong types on cmp instruction");
- if ((E->getOperand(0)->getType() == I->getOperand(0)->getType() &&
- E->getOperand(1)->getType() == I->getOperand(1)->getType())) {
- Value *V = SimplifyCmpInst(Predicate, E->getOperand(0), E->getOperand(1),
- *DL, TLI, DT, AC);
- if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
- return SimplifiedE;
- }
+ assert((E->getOperand(0)->getType() == I->getOperand(0)->getType() &&
+ E->getOperand(1)->getType() == I->getOperand(1)->getType()));
+ Value *V = SimplifyCmpInst(Predicate, E->getOperand(0), E->getOperand(1),
+ DL, TLI, DT, AC);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
} else if (isa<SelectInst>(I)) {
if (isa<Constant>(E->getOperand(0)) ||
- (E->getOperand(1)->getType() == I->getOperand(1)->getType() &&
- E->getOperand(2)->getType() == I->getOperand(2)->getType())) {
+ E->getOperand(0) == E->getOperand(1)) {
+ assert(E->getOperand(1)->getType() == I->getOperand(1)->getType() &&
+ E->getOperand(2)->getType() == I->getOperand(2)->getType());
Value *V = SimplifySelectInst(E->getOperand(0), E->getOperand(1),
- E->getOperand(2), *DL, TLI, DT, AC);
+ E->getOperand(2), DL, TLI, DT, AC);
if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
return SimplifiedE;
}
} else if (I->isBinaryOp()) {
Value *V = SimplifyBinOp(E->getOpcode(), E->getOperand(0), E->getOperand(1),
- *DL, TLI, DT, AC);
+ DL, TLI, DT, AC);
if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
return SimplifiedE;
} else if (auto *BI = dyn_cast<BitCastInst>(I)) {
- Value *V = SimplifyInstruction(BI, *DL, TLI, DT, AC);
+ Value *V = SimplifyInstruction(BI, DL, TLI, DT, AC);
if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
return SimplifiedE;
} else if (isa<GetElementPtrInst>(I)) {
Value *V = SimplifyGEPInst(E->getType(),
ArrayRef<Value *>(E->op_begin(), E->op_end()),
- *DL, TLI, DT, AC);
+ DL, TLI, DT, AC);
if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
return SimplifiedE;
} else if (AllConstant) {
@@ -615,7 +902,7 @@ const Expression *NewGVN::createExpression(Instruction *I,
for (Value *Arg : E->operands())
C.emplace_back(cast<Constant>(Arg));
- if (Value *V = ConstantFoldInstOperands(I, C, *DL, TLI))
+ if (Value *V = ConstantFoldInstOperands(I, C, DL, TLI))
if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
return SimplifiedE;
}
@@ -623,18 +910,18 @@ const Expression *NewGVN::createExpression(Instruction *I,
}
const AggregateValueExpression *
-NewGVN::createAggregateValueExpression(Instruction *I, const BasicBlock *B) {
+NewGVN::createAggregateValueExpression(Instruction *I) {
if (auto *II = dyn_cast<InsertValueInst>(I)) {
auto *E = new (ExpressionAllocator)
AggregateValueExpression(I->getNumOperands(), II->getNumIndices());
- setBasicExpressionInfo(I, E, B);
+ setBasicExpressionInfo(I, E);
E->allocateIntOperands(ExpressionAllocator);
std::copy(II->idx_begin(), II->idx_end(), int_op_inserter(E));
return E;
} else if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
auto *E = new (ExpressionAllocator)
AggregateValueExpression(I->getNumOperands(), EI->getNumIndices());
- setBasicExpressionInfo(EI, E, B);
+ setBasicExpressionInfo(EI, E);
E->allocateIntOperands(ExpressionAllocator);
std::copy(EI->idx_begin(), EI->idx_end(), int_op_inserter(E));
return E;
@@ -648,12 +935,10 @@ const VariableExpression *NewGVN::createVariableExpression(Value *V) {
return E;
}
-const Expression *NewGVN::createVariableOrConstant(Value *V,
- const BasicBlock *B) {
- auto Leader = lookupOperandLeader(V, nullptr, B);
- if (auto *C = dyn_cast<Constant>(Leader))
+const Expression *NewGVN::createVariableOrConstant(Value *V) {
+ if (auto *C = dyn_cast<Constant>(V))
return createConstantExpression(C);
- return createVariableExpression(Leader);
+ return createVariableExpression(V);
}
const ConstantExpression *NewGVN::createConstantExpression(Constant *C) {
@@ -669,40 +954,90 @@ const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) {
}
const CallExpression *NewGVN::createCallExpression(CallInst *CI,
- MemoryAccess *HV,
- const BasicBlock *B) {
+ const MemoryAccess *MA) {
// FIXME: Add operand bundles for calls.
auto *E =
- new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, HV);
- setBasicExpressionInfo(CI, E, B);
+ new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, MA);
+ setBasicExpressionInfo(CI, E);
return E;
}
+// Return true if some equivalent of instruction Inst dominates instruction U.
+bool NewGVN::someEquivalentDominates(const Instruction *Inst,
+ const Instruction *U) const {
+ auto *CC = ValueToClass.lookup(Inst);
+ // This must be an instruction because we are only called from phi nodes
+ // in the case that the value it needs to check against is an instruction.
+
+ // The most likely candidates for dominance are the leader and the next leader.
+ // The leader or next leader will dominate in all cases where there is an
+ // equivalent that is higher up in the dom tree.
+ // We can't *only* check them, however, because the
+ // dominator tree could have an infinite number of non-dominating siblings
+ // with instructions that are in the right congruence class.
+ // A
+ // B C D E F G
+ // |
+ // H
+ // Instruction U could be in H, with equivalents in every other sibling.
+ // Depending on the rpo order picked, the leader could be the equivalent in
+ // any of these siblings.
+ if (!CC)
+ return false;
+ if (DT->dominates(cast<Instruction>(CC->getLeader()), U))
+ return true;
+ if (CC->getNextLeader().first &&
+ DT->dominates(cast<Instruction>(CC->getNextLeader().first), U))
+ return true;
+ return llvm::any_of(*CC, [&](const Value *Member) {
+ return Member != CC->getLeader() &&
+ DT->dominates(cast<Instruction>(Member), U);
+ });
+}
+
// See if we have a congruence class and leader for this operand, and if so,
// return it. Otherwise, return the operand itself.
-template <class T>
-Value *NewGVN::lookupOperandLeader(Value *V, const User *U, const T &B) const {
+Value *NewGVN::lookupOperandLeader(Value *V) const {
CongruenceClass *CC = ValueToClass.lookup(V);
- if (CC && (CC != InitialClass))
- return CC->RepLeader;
+ if (CC) {
+ // Everything in TOP is represented by undef, as it can be any value.
+ // We do have to make sure we get the type right though, so we can't set the
+ // RepLeader to undef.
+ if (CC == TOPClass)
+ return UndefValue::get(V->getType());
+ return CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
+ }
+
return V;
}
-MemoryAccess *NewGVN::lookupMemoryAccessEquiv(MemoryAccess *MA) const {
- MemoryAccess *Result = MemoryAccessEquiv.lookup(MA);
- return Result ? Result : MA;
+const MemoryAccess *NewGVN::lookupMemoryLeader(const MemoryAccess *MA) const {
+ auto *CC = getMemoryClass(MA);
+ assert(CC->getMemoryLeader() &&
+ "Every MemoryAccess should be mapped to a "
+ "congruence class with a represenative memory "
+ "access");
+ return CC->getMemoryLeader();
+}
+
+// Return true if the MemoryAccess is really equivalent to everything. This is
+// equivalent to the lattice value "TOP" in most lattices. This is the initial
+// state of all MemoryAccesses.
+bool NewGVN::isMemoryAccessTop(const MemoryAccess *MA) const {
+ return getMemoryClass(MA) == TOPClass;
}
LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
- LoadInst *LI, MemoryAccess *DA,
- const BasicBlock *B) {
- auto *E = new (ExpressionAllocator) LoadExpression(1, LI, DA);
+ LoadInst *LI,
+ const MemoryAccess *MA) {
+ auto *E =
+ new (ExpressionAllocator) LoadExpression(1, LI, lookupMemoryLeader(MA));
E->allocateOperands(ArgRecycler, ExpressionAllocator);
E->setType(LoadType);
// Give store and loads same opcode so they value number together.
E->setOpcode(0);
- E->op_push_back(lookupOperandLeader(PointerOp, LI, B));
+ E->op_push_back(PointerOp);
if (LI)
E->setAlignment(LI->getAlignment());
@@ -713,16 +1048,16 @@ LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
}
const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI,
- MemoryAccess *DA,
- const BasicBlock *B) {
- auto *E =
- new (ExpressionAllocator) StoreExpression(SI->getNumOperands(), SI, DA);
+ const MemoryAccess *MA) {
+ auto *StoredValueLeader = lookupOperandLeader(SI->getValueOperand());
+ auto *E = new (ExpressionAllocator)
+ StoreExpression(SI->getNumOperands(), SI, StoredValueLeader, MA);
E->allocateOperands(ArgRecycler, ExpressionAllocator);
E->setType(SI->getValueOperand()->getType());
// Give store and loads same opcode so they value number together.
E->setOpcode(0);
- E->op_push_back(lookupOperandLeader(SI->getPointerOperand(), SI, B));
+ E->op_push_back(lookupOperandLeader(SI->getPointerOperand()));
// TODO: Value number heap versions. We may be able to discover
// things alias analysis can't on its own (IE that a store and a
@@ -730,44 +1065,140 @@ const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI,
return E;
}
-// Utility function to check whether the congruence class has a member other
-// than the given instruction.
-bool hasMemberOtherThanUs(const CongruenceClass *CC, Instruction *I) {
- // Either it has more than one store, in which case it must contain something
- // other than us (because it's indexed by value), or if it only has one store
- // right now, that member should not be us.
- return CC->StoreCount > 1 || CC->Members.count(I) == 0;
-}
-
-const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I,
- const BasicBlock *B) {
+const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) {
// Unlike loads, we never try to eliminate stores, so we do not check whether
// they are simple in order to avoid value numbering them.
auto *SI = cast<StoreInst>(I);
- MemoryAccess *StoreAccess = MSSA->getMemoryAccess(SI);
- // See if we are defined by a previous store expression, it already has a
- // value, and it's the same value as our current store. FIXME: Right now, we
- // only do this for simple stores, we should expand to cover memcpys, etc.
+ auto *StoreAccess = MSSA->getMemoryAccess(SI);
+ // Get the expression, if any, for the RHS of the MemoryDef.
+ const MemoryAccess *StoreRHS = StoreAccess->getDefiningAccess();
+ if (EnableStoreRefinement)
+ StoreRHS = MSSAWalker->getClobberingMemoryAccess(StoreAccess);
+ // If we bypassed the use-def chains, make sure we add a use.
+ if (StoreRHS != StoreAccess->getDefiningAccess())
+ addMemoryUsers(StoreRHS, StoreAccess);
+
+ StoreRHS = lookupMemoryLeader(StoreRHS);
+ // If we are defined by ourselves, use the live on entry def.
+ if (StoreRHS == StoreAccess)
+ StoreRHS = MSSA->getLiveOnEntryDef();
+
if (SI->isSimple()) {
- // Get the expression, if any, for the RHS of the MemoryDef.
- MemoryAccess *StoreRHS = lookupMemoryAccessEquiv(
- cast<MemoryDef>(StoreAccess)->getDefiningAccess());
- const Expression *OldStore = createStoreExpression(SI, StoreRHS, B);
- CongruenceClass *CC = ExpressionToClass.lookup(OldStore);
+ // See if we are defined by a previous store expression, it already has a
+ // value, and it's the same value as our current store. FIXME: Right now, we
+ // only do this for simple stores, we should expand to cover memcpys, etc.
+ const auto *LastStore = createStoreExpression(SI, StoreRHS);
+ const auto *LastCC = ExpressionToClass.lookup(LastStore);
// Basically, check if the congruence class the store is in is defined by a
// store that isn't us, and has the same value. MemorySSA takes care of
// ensuring the store has the same memory state as us already.
- if (CC && CC->DefiningExpr && isa<StoreExpression>(CC->DefiningExpr) &&
- CC->RepLeader == lookupOperandLeader(SI->getValueOperand(), SI, B) &&
- hasMemberOtherThanUs(CC, I))
- return createStoreExpression(SI, StoreRHS, B);
+ // The RepStoredValue gets nulled if all the stores disappear in a class, so
+ // we don't need to check if the class contains a store besides us.
+ if (LastCC &&
+ LastCC->getStoredValue() == lookupOperandLeader(SI->getValueOperand()))
+ return LastStore;
+ deleteExpression(LastStore);
+ // Also check if our value operand is defined by a load of the same memory
+ // location, and the memory state is the same as it was then (otherwise, it
+ // could have been overwritten later; see test32 in
+ // transforms/DeadStoreElimination/simple.ll).
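+ // For example (an illustrative sketch, not IR from the test suite):
+ //   %v = load i32, i32* %p
+ //   ...                        ; no clobber of %p in between
+ //   store i32 %v, i32* %p      ; writes back what is already in memory, so
+ //                              ; the store value numbers to %v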
+ if (auto *LI =
+ dyn_cast<LoadInst>(lookupOperandLeader(SI->getValueOperand()))) {
+ if ((lookupOperandLeader(LI->getPointerOperand()) ==
+ lookupOperandLeader(SI->getPointerOperand())) &&
+ (lookupMemoryLeader(MSSA->getMemoryAccess(LI)->getDefiningAccess()) ==
+ StoreRHS))
+ return createVariableExpression(LI);
+ }
}
- return createStoreExpression(SI, StoreAccess, B);
+ // If the store is not equivalent to anything, value number it as a store that
+ // produces a unique memory state (instead of using its MemoryUse, we use
+ // its MemoryDef).
+ return createStoreExpression(SI, StoreAccess);
}
-const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I,
- const BasicBlock *B) {
+// See if we can extract the value a load would produce from a dependent load,
+// store, or memory intrinsic.
+const Expression *
+NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
+ LoadInst *LI, Instruction *DepInst,
+ MemoryAccess *DefiningAccess) {
+ assert((!LI || LI->isSimple()) && "Not a simple load");
+ if (auto *DepSI = dyn_cast<StoreInst>(DepInst)) {
+ // Can't forward from non-atomic to atomic without violating memory model.
+ // Also don't need to coerce if they are the same type, we will just
+ // propagate it.
+ if (LI->isAtomic() > DepSI->isAtomic() ||
+ LoadType == DepSI->getValueOperand()->getType())
+ return nullptr;
+ int Offset = analyzeLoadFromClobberingStore(LoadType, LoadPtr, DepSI, DL);
+ if (Offset >= 0) {
+ if (auto *C = dyn_cast<Constant>(
+ lookupOperandLeader(DepSI->getValueOperand()))) {
+ DEBUG(dbgs() << "Coercing load from store " << *DepSI << " to constant "
+ << *C << "\n");
+ return createConstantExpression(
+ getConstantStoreValueForLoad(C, Offset, LoadType, DL));
+ }
+ }
+
+ } else if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) {
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (LI->isAtomic() > DepLI->isAtomic())
+ return nullptr;
+ int Offset = analyzeLoadFromClobberingLoad(LoadType, LoadPtr, DepLI, DL);
+ if (Offset >= 0) {
+ // We can coerce a constant load into a load
+ if (auto *C = dyn_cast<Constant>(lookupOperandLeader(DepLI)))
+ if (auto *PossibleConstant =
+ getConstantLoadValueForLoad(C, Offset, LoadType, DL)) {
+ DEBUG(dbgs() << "Coercing load from load " << *LI << " to constant "
+ << *PossibleConstant << "\n");
+ return createConstantExpression(PossibleConstant);
+ }
+ }
+
+ } else if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInst)) {
+ int Offset = analyzeLoadFromClobberingMemInst(LoadType, LoadPtr, DepMI, DL);
+ if (Offset >= 0) {
+ if (auto *PossibleConstant =
+ getConstantMemInstValueForLoad(DepMI, Offset, LoadType, DL)) {
+ DEBUG(dbgs() << "Coercing load from meminst " << *DepMI
+ << " to constant " << *PossibleConstant << "\n");
+ return createConstantExpression(PossibleConstant);
+ }
+ }
+ }
+
+ // All of the below are only true if the loaded pointer is produced
+ // by the dependent instruction.
+ if (LoadPtr != lookupOperandLeader(DepInst) &&
+ !AA->isMustAlias(LoadPtr, DepInst))
+ return nullptr;
+ // If this load really doesn't depend on anything, then we must be loading an
+ // undef value. This can happen when loading from a fresh allocation with no
+ // intervening stores, for example. Note that this is only true in the case
+ // that the result of the allocation is pointer equal to the load ptr.
+ if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI)) {
+ return createConstantExpression(UndefValue::get(LoadType));
+ }
+ // If this load occurs right after a lifetime begin,
+ // the loaded value is undefined.
+ else if (auto *II = dyn_cast<IntrinsicInst>(DepInst)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+ return createConstantExpression(UndefValue::get(LoadType));
+ }
+ // If this load follows a calloc (which zero initializes memory),
+ // then the loaded value is zero
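+ // An illustrative sketch (hypothetical IR, assuming %q must-aliases %p and
+ // there is no intervening store):
+ //   %p = call i8* @calloc(i64 1, i64 4)
+ //   %q = bitcast i8* %p to i32*
+ //   %v = load i32, i32* %q     ; %v value numbers to the constant 0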
+ else if (isCallocLikeFn(DepInst, TLI)) {
+ return createConstantExpression(Constant::getNullValue(LoadType));
+ }
+
+ return nullptr;
+}
+
+const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) {
auto *LI = cast<LoadInst>(I);
// We can eliminate in favor of non-simple loads, but we won't be able to
@@ -775,7 +1206,7 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I,
if (!LI->isSimple())
return nullptr;
- Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand(), I, B);
+ Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand());
// Load of undef is undef.
if (isa<UndefValue>(LoadAddressLeader))
return createConstantExpression(UndefValue::get(LI->getType()));
@@ -788,61 +1219,233 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I,
// If the defining instruction is not reachable, replace with undef.
if (!ReachableBlocks.count(DefiningInst->getParent()))
return createConstantExpression(UndefValue::get(LI->getType()));
+ // This will handle stores and memory insts. We only do this if the
+ // defining access has a different type, or it is a pointer produced by
+ // certain memory operations that cause the memory to have a fixed value
+ // (IE things like calloc).
+ if (const auto *CoercionResult =
+ performSymbolicLoadCoercion(LI->getType(), LoadAddressLeader, LI,
+ DefiningInst, DefiningAccess))
+ return CoercionResult;
}
}
- const Expression *E =
- createLoadExpression(LI->getType(), LI->getPointerOperand(), LI,
- lookupMemoryAccessEquiv(DefiningAccess), B);
+ const Expression *E = createLoadExpression(LI->getType(), LoadAddressLeader,
+ LI, DefiningAccess);
return E;
}
+const Expression *
+NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) {
+ auto *PI = PredInfo->getPredicateInfoFor(I);
+ if (!PI)
+ return nullptr;
+
+ DEBUG(dbgs() << "Found predicate info from instruction !\n");
+
+ auto *PWC = dyn_cast<PredicateWithCondition>(PI);
+ if (!PWC)
+ return nullptr;
+
+ auto *CopyOf = I->getOperand(0);
+ auto *Cond = PWC->Condition;
+
+ // If this is a copy of the condition, it must be either true or false
+ // depending on the predicate info type and edge.
+ if (CopyOf == Cond) {
+ // We should not need to add predicate users because the predicate info is
+ // already a use of this operand.
+ if (isa<PredicateAssume>(PI))
+ return createConstantExpression(ConstantInt::getTrue(Cond->getType()));
+ if (auto *PBranch = dyn_cast<PredicateBranch>(PI)) {
+ if (PBranch->TrueEdge)
+ return createConstantExpression(ConstantInt::getTrue(Cond->getType()));
+ return createConstantExpression(ConstantInt::getFalse(Cond->getType()));
+ }
+ if (auto *PSwitch = dyn_cast<PredicateSwitch>(PI))
+ return createConstantExpression(cast<Constant>(PSwitch->CaseValue));
+ }
+
+ // Not a copy of the condition, so see what the predicates tell us about this
+ // value. First, though, we check to make sure the value is actually a copy
+ // of one of the condition operands. It's possible, in certain cases, for it
+ // to be a copy of a predicateinfo copy. In particular, if two branch
+ // operations use the same condition, and one branch dominates the other, we
+ // will end up with a copy of a copy. This is currently a small deficiency in
+ // predicateinfo. What will end up happening here is that we will value
+ // number both copies the same anyway.
+
+ // Everything below relies on the condition being a comparison.
+ auto *Cmp = dyn_cast<CmpInst>(Cond);
+ if (!Cmp)
+ return nullptr;
+
+ if (CopyOf != Cmp->getOperand(0) && CopyOf != Cmp->getOperand(1)) {
+ DEBUG(dbgs() << "Copy is not of any condition operands!");
+ return nullptr;
+ }
+ Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0));
+ Value *SecondOp = lookupOperandLeader(Cmp->getOperand(1));
+ bool SwappedOps = false;
+ // Sort the ops
+ if (shouldSwapOperands(FirstOp, SecondOp)) {
+ std::swap(FirstOp, SecondOp);
+ SwappedOps = true;
+ }
+ CmpInst::Predicate Predicate =
+ SwappedOps ? Cmp->getSwappedPredicate() : Cmp->getPredicate();
+
+ if (isa<PredicateAssume>(PI)) {
+ // If the comparison is true when the operands are equal, then we know the
+ // operands are equal, because assumes must always be true.
+ if (CmpInst::isTrueWhenEqual(Predicate)) {
+ addPredicateUsers(PI, I);
+ return createVariableOrConstant(FirstOp);
+ }
+ }
+ if (const auto *PBranch = dyn_cast<PredicateBranch>(PI)) {
+ // If we are *not* a copy of the comparison, we may be equal to the other
+ // operand when the predicate implies something about equality of
+ // operations. In particular, if the comparison is true/false when the
+ // operands are equal, and we are on the right edge, we know this operation
+ // is equal to something.
+ if ((PBranch->TrueEdge && Predicate == CmpInst::ICMP_EQ) ||
+ (!PBranch->TrueEdge && Predicate == CmpInst::ICMP_NE)) {
+ addPredicateUsers(PI, I);
+ return createVariableOrConstant(FirstOp);
+ }
+ // Handle the special case of floating point.
+ if (((PBranch->TrueEdge && Predicate == CmpInst::FCMP_OEQ) ||
+ (!PBranch->TrueEdge && Predicate == CmpInst::FCMP_UNE)) &&
+ isa<ConstantFP>(FirstOp) && !cast<ConstantFP>(FirstOp)->isZero()) {
+ addPredicateUsers(PI, I);
+ return createConstantExpression(cast<Constant>(FirstOp));
+ }
+ }
+ return nullptr;
+}
+
// Evaluate read only and pure calls, and create an expression result.
-const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I,
- const BasicBlock *B) {
+const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) {
auto *CI = cast<CallInst>(I);
- if (AA->doesNotAccessMemory(CI))
- return createCallExpression(CI, nullptr, B);
- if (AA->onlyReadsMemory(CI)) {
+ if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ // Intrinsics with the returned attribute are copies of arguments.
+ if (auto *ReturnedValue = II->getReturnedArgOperand()) {
+ if (II->getIntrinsicID() == Intrinsic::ssa_copy)
+ if (const auto *Result = performSymbolicPredicateInfoEvaluation(I))
+ return Result;
+ return createVariableOrConstant(ReturnedValue);
+ }
+ }
+ if (AA->doesNotAccessMemory(CI)) {
+ return createCallExpression(CI, TOPClass->getMemoryLeader());
+ } else if (AA->onlyReadsMemory(CI)) {
MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(CI);
- return createCallExpression(CI, lookupMemoryAccessEquiv(DefiningAccess), B);
+ return createCallExpression(CI, DefiningAccess);
}
return nullptr;
}
-// Update the memory access equivalence table to say that From is equal to To,
+// Retrieve the memory class for a given MemoryAccess.
+CongruenceClass *NewGVN::getMemoryClass(const MemoryAccess *MA) const {
+
+ auto *Result = MemoryAccessToClass.lookup(MA);
+ assert(Result && "Should have found memory class");
+ return Result;
+}
+
+// Update the MemoryAccess equivalence table to say that From is equal to To,
// and return true if this is different from what already existed in the table.
-bool NewGVN::setMemoryAccessEquivTo(MemoryAccess *From, MemoryAccess *To) {
- DEBUG(dbgs() << "Setting " << *From << " equivalent to ");
- if (!To)
- DEBUG(dbgs() << "itself");
- else
- DEBUG(dbgs() << *To);
+bool NewGVN::setMemoryClass(const MemoryAccess *From,
+ CongruenceClass *NewClass) {
+ assert(NewClass &&
+ "Every MemoryAccess should be getting mapped to a non-null class");
+ DEBUG(dbgs() << "Setting " << *From);
+ DEBUG(dbgs() << " equivalent to congruence class ");
+ DEBUG(dbgs() << NewClass->getID() << " with current MemoryAccess leader ");
+ DEBUG(dbgs() << *NewClass->getMemoryLeader());
DEBUG(dbgs() << "\n");
- auto LookupResult = MemoryAccessEquiv.find(From);
+
+ auto LookupResult = MemoryAccessToClass.find(From);
bool Changed = false;
// If it's already in the table, see if the value changed.
- if (LookupResult != MemoryAccessEquiv.end()) {
- if (To && LookupResult->second != To) {
+ if (LookupResult != MemoryAccessToClass.end()) {
+ auto *OldClass = LookupResult->second;
+ if (OldClass != NewClass) {
+ // If this is a phi, we have to handle memory member updates.
+ if (auto *MP = dyn_cast<MemoryPhi>(From)) {
+ OldClass->memory_erase(MP);
+ NewClass->memory_insert(MP);
+ // This may have killed the class if it had no non-memory members
+ if (OldClass->getMemoryLeader() == From) {
+ if (OldClass->memory_empty()) {
+ OldClass->setMemoryLeader(nullptr);
+ } else {
+ OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
+ DEBUG(dbgs() << "Memory class leader change for class "
+ << OldClass->getID() << " to "
+ << *OldClass->getMemoryLeader()
+ << " due to removal of a memory member " << *From
+ << "\n");
+ markMemoryLeaderChangeTouched(OldClass);
+ }
+ }
+ }
// It wasn't equivalent before, and now it is.
- LookupResult->second = To;
- Changed = true;
- } else if (!To) {
- // It used to be equivalent to something, and now it's not.
- MemoryAccessEquiv.erase(LookupResult);
+ LookupResult->second = NewClass;
Changed = true;
}
- } else {
- assert(!To &&
- "Memory equivalence should never change from nothing to something");
}
return Changed;
}
+
+// Determine if a phi is cycle-free. That means the values in the phi don't
+// depend on any expressions that can change value as a result of the phi.
+// For example, a non-cycle free phi would be v = phi(0, v+1).
+bool NewGVN::isCycleFree(const PHINode *PN) {
+ // In order to compute cycle-freeness, we do SCC finding on the phi, and see
+ // what kind of SCC it ends up in. If it is a singleton, it is cycle-free.
+ // If it is not in a singleton, it is only cycle free if the other members are
+ // all phi nodes (as they do not compute anything, they are copies). TODO:
+ // There are likely a few other intrinsics or expressions that could be
+ // included here, but this happens so infrequently already that it is not
+ // likely to be worth it.
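+ // For example (illustrative): two loop-carried phis that only feed each
+ // other, such as
+ //   %a = phi i32 [ %x, %entry ], [ %b, %latch ]
+ //   %b = phi i32 [ %a, %body ], ...
+ // form an all-phi SCC and are cycle free, while v = phi(0, v+1) pulls the
+ // add into its SCC and is not.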
+ auto PCS = PhiCycleState.lookup(PN);
+ if (PCS == PCS_Unknown) {
+ SCCFinder.Start(PN);
+ auto &SCC = SCCFinder.getComponentFor(PN);
+ // It's cycle free if its size is 1 or the SCC contains *only* phi nodes.
+ if (SCC.size() == 1)
+ PhiCycleState.insert({PN, PCS_CycleFree});
+ else {
+ bool AllPhis =
+ llvm::all_of(SCC, [](const Value *V) { return isa<PHINode>(V); });
+ PCS = AllPhis ? PCS_CycleFree : PCS_Cycle;
+ for (auto *Member : SCC)
+ if (auto *MemberPhi = dyn_cast<PHINode>(Member))
+ PhiCycleState.insert({MemberPhi, PCS});
+ }
+ }
+ if (PCS == PCS_Cycle)
+ return false;
+ return true;
+}
+
// Evaluate PHI nodes symbolically, and create an expression result.
-const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I,
- const BasicBlock *B) {
- auto *E = cast<PHIExpression>(createPHIExpression(I));
+const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) {
+ // True if one of the incoming phi edges is a backedge.
+ bool HasBackedge = false;
+ // AllConstant tracks whether all of the *original* phi operands were
+ // constant. This is really shorthand for "this phi cannot cycle due to
+ // forward propagation", as any change in value of the phi is guaranteed not
+ // to later change the value of the phi.
+ // IE it can't be v = phi(undef, v+1).
+ bool AllConstant = true;
+ auto *E =
+ cast<PHIExpression>(createPHIExpression(I, HasBackedge, AllConstant));
// We match the semantics of SimplifyPhiNode from InstructionSimplify here.
// See if all arguments are the same.
@@ -861,14 +1464,15 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I,
if (Filtered.begin() == Filtered.end()) {
DEBUG(dbgs() << "Simplified PHI node " << *I << " to undef"
<< "\n");
- E->deallocateOperands(ArgRecycler);
- ExpressionAllocator.Deallocate(E);
+ deleteExpression(E);
return createConstantExpression(UndefValue::get(I->getType()));
}
+ unsigned NumOps = 0;
Value *AllSameValue = *(Filtered.begin());
++Filtered.begin();
// Can't use std::equal here, sadly, because filter.begin moves.
- if (llvm::all_of(Filtered, [AllSameValue](const Value *V) {
+ if (llvm::all_of(Filtered, [AllSameValue, &NumOps](const Value *V) {
+ ++NumOps;
return V == AllSameValue;
})) {
// In LLVM's non-standard representation of phi nodes, it's possible to have
@@ -881,27 +1485,32 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I,
// We also special case undef, so that if we have an undef, we can't use the
// common value unless it dominates the phi block.
if (HasUndef) {
+ // If we have undef and at least one other value, this is really a
+ // multivalued phi, and we need to know if it's cycle free in order to
+ // evaluate whether we can ignore the undef. The other parts of this are
+ // just shortcuts. If there is no backedge, or all operands are
+ // constants, or all operands other than the undef are ignored, it also
+ // must be cycle free.
+ if (!AllConstant && HasBackedge && NumOps > 0 &&
+ !isa<UndefValue>(AllSameValue) && !isCycleFree(cast<PHINode>(I)))
+ return E;
+
// Only have to check for instructions
if (auto *AllSameInst = dyn_cast<Instruction>(AllSameValue))
- if (!DT->dominates(AllSameInst, I))
+ if (!someEquivalentDominates(AllSameInst, I))
return E;
}
NumGVNPhisAllSame++;
DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue
<< "\n");
- E->deallocateOperands(ArgRecycler);
- ExpressionAllocator.Deallocate(E);
- if (auto *C = dyn_cast<Constant>(AllSameValue))
- return createConstantExpression(C);
- return createVariableExpression(AllSameValue);
+ deleteExpression(E);
+ return createVariableOrConstant(AllSameValue);
}
return E;
}
-const Expression *
-NewGVN::performSymbolicAggrValueEvaluation(Instruction *I,
- const BasicBlock *B) {
+const Expression *NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) {
if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
auto *II = dyn_cast<IntrinsicInst>(EI->getAggregateOperand());
if (II && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) {
@@ -931,19 +1540,130 @@ NewGVN::performSymbolicAggrValueEvaluation(Instruction *I,
// expression.
assert(II->getNumArgOperands() == 2 &&
"Expect two args for recognised intrinsics.");
- return createBinaryExpression(Opcode, EI->getType(),
- II->getArgOperand(0),
- II->getArgOperand(1), B);
+ return createBinaryExpression(
+ Opcode, EI->getType(), II->getArgOperand(0), II->getArgOperand(1));
}
}
}
- return createAggregateValueExpression(I, B);
+ return createAggregateValueExpression(I);
+}
+const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) {
+ auto *CI = dyn_cast<CmpInst>(I);
+ // See if our operands are equal to those of a previous predicate, and if so,
+ // if it implies true or false.
+ auto Op0 = lookupOperandLeader(CI->getOperand(0));
+ auto Op1 = lookupOperandLeader(CI->getOperand(1));
+ auto OurPredicate = CI->getPredicate();
+ if (shouldSwapOperands(Op0, Op1)) {
+ std::swap(Op0, Op1);
+ OurPredicate = CI->getSwappedPredicate();
+ }
+
+ // Avoid processing the same info twice
+ const PredicateBase *LastPredInfo = nullptr;
+ // See if we know something about the comparison itself, like it is the target
+ // of an assume.
+ auto *CmpPI = PredInfo->getPredicateInfoFor(I);
+ if (dyn_cast_or_null<PredicateAssume>(CmpPI))
+ return createConstantExpression(ConstantInt::getTrue(CI->getType()));
+
+ if (Op0 == Op1) {
+ // This condition does not depend on predicates, no need to add users
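+ // For example (illustrative): if %a and %b are in the same congruence
+ // class, "icmp eq i32 %a, %b" folds to true here and "icmp slt i32 %a, %b"
+ // folds to false.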
+ if (CI->isTrueWhenEqual())
+ return createConstantExpression(ConstantInt::getTrue(CI->getType()));
+ else if (CI->isFalseWhenEqual())
+ return createConstantExpression(ConstantInt::getFalse(CI->getType()));
+ }
+
+ // NOTE: Because we are comparing both operands here and below, and using
+ // previous comparisons, we rely on the fact that predicateinfo knows to mark
+ // comparisons that use renamed operands as users of the earlier comparisons.
+ // It is *not* enough to just mark predicateinfo renamed operands as users of
+ // the earlier comparisons, because the *other* operand may have changed in a
+ // previous iteration.
+ // Example:
+ // icmp slt %a, %b
+ // %b.0 = ssa.copy(%b)
+ // false branch:
+ // icmp slt %c, %b.0
+
+ // %c and %a may start out equal, and thus, the code below will say the second
+ // icmp is false. %c may become equal to something else, and in that case the
+ // second icmp *must* be reexamined, but it would not be if only the renamed
+ // operands are considered users of the icmp.
+
+ // *Currently* we only check one level of comparisons back, and only mark one
+ // level back as touched when changes happen. If you modify this code to look
+ // back farther through comparisons, you *must* mark the appropriate
+ // comparisons as users in PredicateInfo.cpp, or you will cause bugs.
+ // See if we know something just from the operands themselves.
+
+ // See if our operands have predicate info, so that we may be able to derive
+ // something from a previous comparison.
+ for (const auto &Op : CI->operands()) {
+ auto *PI = PredInfo->getPredicateInfoFor(Op);
+ if (const auto *PBranch = dyn_cast_or_null<PredicateBranch>(PI)) {
+ if (PI == LastPredInfo)
+ continue;
+ LastPredInfo = PI;
+
+ // TODO: Along the false edge, we may know more things too, like that an
+ // icmp of the same operands is false.
+ // TODO: We only handle actual comparison conditions below, not and/or.
+ auto *BranchCond = dyn_cast<CmpInst>(PBranch->Condition);
+ if (!BranchCond)
+ continue;
+ auto *BranchOp0 = lookupOperandLeader(BranchCond->getOperand(0));
+ auto *BranchOp1 = lookupOperandLeader(BranchCond->getOperand(1));
+ auto BranchPredicate = BranchCond->getPredicate();
+ if (shouldSwapOperands(BranchOp0, BranchOp1)) {
+ std::swap(BranchOp0, BranchOp1);
+ BranchPredicate = BranchCond->getSwappedPredicate();
+ }
+ if (BranchOp0 == Op0 && BranchOp1 == Op1) {
+ if (PBranch->TrueEdge) {
+ // If we know the previous predicate is true and we are on the true
+ // edge, then we may be implied true or false.
+ if (CmpInst::isImpliedTrueByMatchingCmp(OurPredicate,
+ BranchPredicate)) {
+ addPredicateUsers(PI, I);
+ return createConstantExpression(
+ ConstantInt::getTrue(CI->getType()));
+ }
+
+ if (CmpInst::isImpliedFalseByMatchingCmp(OurPredicate,
+ BranchPredicate)) {
+ addPredicateUsers(PI, I);
+ return createConstantExpression(
+ ConstantInt::getFalse(CI->getType()));
+ }
+
+ } else {
+ // Just handle the ne and eq cases, where if we have the same
+ // operands, we may know something.
+ if (BranchPredicate == OurPredicate) {
+ addPredicateUsers(PI, I);
+ // Same predicate, same ops, we know it was false, so this is false.
+ return createConstantExpression(
+ ConstantInt::getFalse(CI->getType()));
+ } else if (BranchPredicate ==
+ CmpInst::getInversePredicate(OurPredicate)) {
+ addPredicateUsers(PI, I);
+ // Inverse predicate, we know the other was false, so this is true.
+ return createConstantExpression(
+ ConstantInt::getTrue(CI->getType()));
+ }
+ }
+ }
+ }
+ }
+ // Create expression will take care of simplifyCmpInst
+ return createExpression(I);
}
// Substitute and symbolize the value before value numbering.
-const Expression *NewGVN::performSymbolicEvaluation(Value *V,
- const BasicBlock *B) {
+const Expression *NewGVN::performSymbolicEvaluation(Value *V) {
const Expression *E = nullptr;
if (auto *C = dyn_cast<Constant>(V))
E = createConstantExpression(C);
@@ -957,24 +1677,27 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V,
switch (I->getOpcode()) {
case Instruction::ExtractValue:
case Instruction::InsertValue:
- E = performSymbolicAggrValueEvaluation(I, B);
+ E = performSymbolicAggrValueEvaluation(I);
break;
case Instruction::PHI:
- E = performSymbolicPHIEvaluation(I, B);
+ E = performSymbolicPHIEvaluation(I);
break;
case Instruction::Call:
- E = performSymbolicCallEvaluation(I, B);
+ E = performSymbolicCallEvaluation(I);
break;
case Instruction::Store:
- E = performSymbolicStoreEvaluation(I, B);
+ E = performSymbolicStoreEvaluation(I);
break;
case Instruction::Load:
- E = performSymbolicLoadEvaluation(I, B);
+ E = performSymbolicLoadEvaluation(I);
break;
case Instruction::BitCast: {
- E = createExpression(I, B);
+ E = createExpression(I);
+ } break;
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ E = performSymbolicCmpEvaluation(I);
} break;
-
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
@@ -993,8 +1716,6 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V,
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
- case Instruction::ICmp:
- case Instruction::FCmp:
case Instruction::Trunc:
case Instruction::ZExt:
case Instruction::SExt:
@@ -1011,7 +1732,7 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V,
case Instruction::InsertElement:
case Instruction::ShuffleVector:
case Instruction::GetElementPtr:
- E = createExpression(I, B);
+ E = createExpression(I);
break;
default:
return nullptr;
@@ -1020,129 +1741,297 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V,
return E;
}
-// There is an edge from 'Src' to 'Dst'. Return true if every path from
-// the entry block to 'Dst' passes via this edge. In particular 'Dst'
-// must not be reachable via another edge from 'Src'.
-bool NewGVN::isOnlyReachableViaThisEdge(const BasicBlockEdge &E) const {
-
- // While in theory it is interesting to consider the case in which Dst has
- // more than one predecessor, because Dst might be part of a loop which is
- // only reachable from Src, in practice it is pointless since at the time
- // GVN runs all such loops have preheaders, which means that Dst will have
- // been changed to have only one predecessor, namely Src.
- const BasicBlock *Pred = E.getEnd()->getSinglePredecessor();
- const BasicBlock *Src = E.getStart();
- assert((!Pred || Pred == Src) && "No edge between these basic blocks!");
- (void)Src;
- return Pred != nullptr;
-}
-
void NewGVN::markUsersTouched(Value *V) {
// Now mark the users as touched.
for (auto *User : V->users()) {
assert(isa<Instruction>(User) && "Use of value not within an instruction?");
- TouchedInstructions.set(InstrDFS[User]);
+ TouchedInstructions.set(InstrToDFSNum(User));
}
}
-void NewGVN::markMemoryUsersTouched(MemoryAccess *MA) {
- for (auto U : MA->users()) {
- if (auto *MUD = dyn_cast<MemoryUseOrDef>(U))
- TouchedInstructions.set(InstrDFS[MUD->getMemoryInst()]);
- else
- TouchedInstructions.set(InstrDFS[U]);
+void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) {
+ DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n");
+ MemoryToUsers[To].insert(U);
+}
+
+void NewGVN::markMemoryDefTouched(const MemoryAccess *MA) {
+ TouchedInstructions.set(MemoryToDFSNum(MA));
+}
+
+void NewGVN::markMemoryUsersTouched(const MemoryAccess *MA) {
+ if (isa<MemoryUse>(MA))
+ return;
+ for (auto U : MA->users())
+ TouchedInstructions.set(MemoryToDFSNum(U));
+ const auto Result = MemoryToUsers.find(MA);
+ if (Result != MemoryToUsers.end()) {
+ for (auto *User : Result->second)
+ TouchedInstructions.set(MemoryToDFSNum(User));
+ MemoryToUsers.erase(Result);
+ }
+}
+
+// Add I to the set of users of a given predicate.
+void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) {
+ if (auto *PBranch = dyn_cast<PredicateBranch>(PB))
+ PredicateToUsers[PBranch->Condition].insert(I);
+ else if (auto *PAssume = dyn_cast<PredicateBranch>(PB))
+ PredicateToUsers[PAssume->Condition].insert(I);
+}
+
+// Touch all the predicates that depend on this instruction.
+void NewGVN::markPredicateUsersTouched(Instruction *I) {
+ const auto Result = PredicateToUsers.find(I);
+ if (Result != PredicateToUsers.end()) {
+ for (auto *User : Result->second)
+ TouchedInstructions.set(InstrToDFSNum(User));
+ PredicateToUsers.erase(Result);
}
}
+// Mark users affected by a memory leader change.
+void NewGVN::markMemoryLeaderChangeTouched(CongruenceClass *CC) {
+ for (auto M : CC->memory())
+ markMemoryDefTouched(M);
+}
+
// Touch the instructions that need to be updated after a congruence class has a
// leader change, and mark changed values.
-void NewGVN::markLeaderChangeTouched(CongruenceClass *CC) {
- for (auto M : CC->Members) {
+void NewGVN::markValueLeaderChangeTouched(CongruenceClass *CC) {
+ for (auto M : *CC) {
if (auto *I = dyn_cast<Instruction>(M))
- TouchedInstructions.set(InstrDFS[I]);
+ TouchedInstructions.set(InstrToDFSNum(I));
LeaderChanges.insert(M);
}
}
+// Given a range of things that have instruction DFS numbers, this will return
+// the member of the range with the smallest DFS number.
+template <class T, class Range>
+T *NewGVN::getMinDFSOfRange(const Range &R) const {
+ std::pair<T *, unsigned> MinDFS = {nullptr, ~0U};
+ for (const auto X : R) {
+ auto DFSNum = InstrToDFSNum(X);
+ if (DFSNum < MinDFS.second)
+ MinDFS = {X, DFSNum};
+ }
+ return MinDFS.first;
+}
+
+// This function returns the MemoryAccess that should be the next leader of
+// congruence class CC, under the assumption that the current leader is going to
+// disappear.
+const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const {
+ // TODO: If this ends up too slow, we can maintain a next memory leader like we
+ // do for regular leaders.
+ // Make sure there will be a leader to find
+ assert((CC->getStoreCount() > 0 || !CC->memory_empty()) &&
+ "Can't get next leader if there is none");
+ if (CC->getStoreCount() > 0) {
+ if (auto *NL = dyn_cast_or_null<StoreInst>(CC->getNextLeader().first))
+ return MSSA->getMemoryAccess(NL);
+ // Find the store with the minimum DFS number.
+ auto *V = getMinDFSOfRange<Value>(make_filter_range(
+ *CC, [&](const Value *V) { return isa<StoreInst>(V); }));
+ return MSSA->getMemoryAccess(cast<StoreInst>(V));
+ }
+ assert(CC->getStoreCount() == 0);
+
+ // Given our assertion, hitting this part must mean
+ // !OldClass->memory_empty()
+ if (CC->memory_size() == 1)
+ return *CC->memory_begin();
+ return getMinDFSOfRange<const MemoryPhi>(CC->memory());
+}
+
+// This function returns the next value leader of a congruence class, under the
+// assumption that the current leader is going away. This should end up being
+// the next most dominating member.
+Value *NewGVN::getNextValueLeader(CongruenceClass *CC) const {
+ // We don't need to sort members if there is only 1, and we don't care about
+ // sorting the TOP class because everything either gets out of it or is
+ // unreachable.
+
+ if (CC->size() == 1 || CC == TOPClass) {
+ return *(CC->begin());
+ } else if (CC->getNextLeader().first) {
+ ++NumGVNAvoidedSortedLeaderChanges;
+ return CC->getNextLeader().first;
+ } else {
+ ++NumGVNSortedLeaderChanges;
+ // NOTE: If this ends up too slow, we can maintain a dual structure for
+ // member testing/insertion, or keep things mostly sorted, and sort only
+ // here, or use SparseBitVector or ....
+ return getMinDFSOfRange<Value>(*CC);
+ }
+}
+
+// Move a MemoryAccess, currently in OldClass, to NewClass, including updates to
+// the memory members, etc for the move.
+//
+// The invariants of this function are:
+//
+// I must be moving to NewClass from OldClass. The StoreCount of OldClass and
+// NewClass is expected to have been updated for I already if it is a store.
+// The OldClass memory leader has not been updated yet if I was the leader.
+void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
+ MemoryAccess *InstMA,
+ CongruenceClass *OldClass,
+ CongruenceClass *NewClass) {
+ // If the leader is I, and we had a representative MemoryAccess, it should
+ // be the MemoryAccess of OldClass.
+ assert((!InstMA || !OldClass->getMemoryLeader() ||
+ OldClass->getLeader() != I ||
+ OldClass->getMemoryLeader() == InstMA) &&
+ "Representative MemoryAccess mismatch");
+ // First, see what happens to the new class
+ if (!NewClass->getMemoryLeader()) {
+ // Should be a new class, or a store becoming a leader of a new class.
+ assert(NewClass->size() == 1 ||
+ (isa<StoreInst>(I) && NewClass->getStoreCount() == 1));
+ NewClass->setMemoryLeader(InstMA);
+ // Mark it touched if we didn't just create a singleton
+ DEBUG(dbgs() << "Memory class leader change for class " << NewClass->getID()
+ << " due to new memory instruction becoming leader\n");
+ markMemoryLeaderChangeTouched(NewClass);
+ }
+ setMemoryClass(InstMA, NewClass);
+ // Now, fixup the old class if necessary
+ if (OldClass->getMemoryLeader() == InstMA) {
+ if (OldClass->getStoreCount() != 0 || !OldClass->memory_empty()) {
+ OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
+ DEBUG(dbgs() << "Memory class leader change for class "
+ << OldClass->getID() << " to "
+ << *OldClass->getMemoryLeader()
+ << " due to removal of old leader " << *InstMA << "\n");
+ markMemoryLeaderChangeTouched(OldClass);
+ } else
+ OldClass->setMemoryLeader(nullptr);
+ }
+}
+
// Move a value, currently in OldClass, to be part of NewClass
-// Update OldClass for the move (including changing leaders, etc)
-void NewGVN::moveValueToNewCongruenceClass(Instruction *I,
+// Update OldClass and NewClass for the move (including changing leaders, etc).
+void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
CongruenceClass *OldClass,
CongruenceClass *NewClass) {
- DEBUG(dbgs() << "New congruence class for " << I << " is " << NewClass->ID
- << "\n");
-
- if (I == OldClass->NextLeader.first)
- OldClass->NextLeader = {nullptr, ~0U};
+ if (I == OldClass->getNextLeader().first)
+ OldClass->resetNextLeader();
// It's possible, though unlikely, for us to discover equivalences such
// that the current leader does not dominate the old one.
// This statistic tracks how often this happens.
// We assert on phi nodes when this happens, currently, for debugging, because
// we want to make sure we name phi node cycles properly.
- if (isa<Instruction>(NewClass->RepLeader) && NewClass->RepLeader &&
- I != NewClass->RepLeader &&
- DT->properlyDominates(
- I->getParent(),
- cast<Instruction>(NewClass->RepLeader)->getParent())) {
- ++NumGVNNotMostDominatingLeader;
- assert(!isa<PHINode>(I) &&
- "New class for instruction should not be dominated by instruction");
- }
-
- if (NewClass->RepLeader != I) {
- auto DFSNum = InstrDFS.lookup(I);
- if (DFSNum < NewClass->NextLeader.second)
- NewClass->NextLeader = {I, DFSNum};
+ if (isa<Instruction>(NewClass->getLeader()) && NewClass->getLeader() &&
+ I != NewClass->getLeader()) {
+ auto *IBB = I->getParent();
+ auto *NCBB = cast<Instruction>(NewClass->getLeader())->getParent();
+ bool Dominated =
+ IBB == NCBB && InstrToDFSNum(I) < InstrToDFSNum(NewClass->getLeader());
+ Dominated = Dominated || DT->properlyDominates(IBB, NCBB);
+ if (Dominated) {
+ ++NumGVNNotMostDominatingLeader;
+ assert(
+ !isa<PHINode>(I) &&
+ "New class for instruction should not be dominated by instruction");
+ }
}
- OldClass->Members.erase(I);
- NewClass->Members.insert(I);
- if (isa<StoreInst>(I)) {
- --OldClass->StoreCount;
- assert(OldClass->StoreCount >= 0);
- ++NewClass->StoreCount;
- assert(NewClass->StoreCount > 0);
+ if (NewClass->getLeader() != I)
+ NewClass->addPossibleNextLeader({I, InstrToDFSNum(I)});
+
+ OldClass->erase(I);
+ NewClass->insert(I);
+ // Handle our special casing of stores.
+ if (auto *SI = dyn_cast<StoreInst>(I)) {
+ OldClass->decStoreCount();
+ // Okay, so when do we want to make a store a leader of a class?
+ // If we have a store defined by an earlier load, we want the earlier load
+ // to lead the class.
+ // If we have a store defined by something else, we want the store to lead
+ // the class so everything else gets the "something else" as a value.
+ // If we have a store as the single member of the class, we want the store
+ // as the leader
+ if (NewClass->getStoreCount() == 0 && !NewClass->getStoredValue()) {
+ // If it's a store expression we are using, it means we are not equivalent
+ // to something earlier.
+ if (isa<StoreExpression>(E)) {
+ assert(lookupOperandLeader(SI->getValueOperand()) !=
+ NewClass->getLeader());
+ NewClass->setStoredValue(lookupOperandLeader(SI->getValueOperand()));
+ markValueLeaderChangeTouched(NewClass);
+ // Shift the new class leader to be the store
+ DEBUG(dbgs() << "Changing leader of congruence class "
+ << NewClass->getID() << " from " << *NewClass->getLeader()
+ << " to " << *SI << " because store joined class\n");
+ // If we changed the leader, we have to mark it changed because we don't
+ // know what it will do to symbolic evaluation.
+ NewClass->setLeader(SI);
+ }
+ // We rely on the code below handling the MemoryAccess change.
+ }
+ NewClass->incStoreCount();
}
-
+ // True if there are no memory instructions left in a class that had memory
+ // instructions before.
+
+ // If it's not a memory use, set the MemoryAccess equivalence
+ auto *InstMA = dyn_cast_or_null<MemoryDef>(MSSA->getMemoryAccess(I));
+ bool InstWasMemoryLeader = InstMA && OldClass->getMemoryLeader() == InstMA;
+ if (InstMA)
+ moveMemoryToNewCongruenceClass(I, InstMA, OldClass, NewClass);
ValueToClass[I] = NewClass;
// See if we destroyed the class or need to swap leaders.
- if (OldClass->Members.empty() && OldClass != InitialClass) {
- if (OldClass->DefiningExpr) {
- OldClass->Dead = true;
- DEBUG(dbgs() << "Erasing expression " << OldClass->DefiningExpr
+ if (OldClass->empty() && OldClass != TOPClass) {
+ if (OldClass->getDefiningExpr()) {
+ DEBUG(dbgs() << "Erasing expression " << OldClass->getDefiningExpr()
<< " from table\n");
- ExpressionToClass.erase(OldClass->DefiningExpr);
+ ExpressionToClass.erase(OldClass->getDefiningExpr());
}
- } else if (OldClass->RepLeader == I) {
+ } else if (OldClass->getLeader() == I) {
// When the leader changes, the value numbering of
// everything may change due to symbolization changes, so we need to
// reprocess.
- DEBUG(dbgs() << "Leader change!\n");
+ DEBUG(dbgs() << "Value class leader change for class " << OldClass->getID()
+ << "\n");
++NumGVNLeaderChanges;
- // We don't need to sort members if there is only 1, and we don't care about
- // sorting the initial class because everything either gets out of it or is
- // unreachable.
- if (OldClass->Members.size() == 1 || OldClass == InitialClass) {
- OldClass->RepLeader = *(OldClass->Members.begin());
- } else if (OldClass->NextLeader.first) {
- ++NumGVNAvoidedSortedLeaderChanges;
- OldClass->RepLeader = OldClass->NextLeader.first;
- OldClass->NextLeader = {nullptr, ~0U};
- } else {
- ++NumGVNSortedLeaderChanges;
- // TODO: If this ends up to slow, we can maintain a dual structure for
- // member testing/insertion, or keep things mostly sorted, and sort only
- // here, or ....
- std::pair<Value *, unsigned> MinDFS = {nullptr, ~0U};
- for (const auto X : OldClass->Members) {
- auto DFSNum = InstrDFS.lookup(X);
- if (DFSNum < MinDFS.second)
- MinDFS = {X, DFSNum};
- }
- OldClass->RepLeader = MinDFS.first;
+ // Destroy the stored value if there are no more stores to represent it.
+ // Note that this is basically clean up for the expression removal that
+ // happens below. If we remove stores from a class, we may leave it as a
+ // class of equivalent memory phis.
+ if (OldClass->getStoreCount() == 0) {
+ if (OldClass->getStoredValue())
+ OldClass->setStoredValue(nullptr);
+ }
+ // If we destroy the old access leader and it's a store, we have to
+ // effectively destroy the congruence class. When it comes to scalars,
+ // anything with the same value is as good as any other. That means that
+ // one leader is as good as another, and as long as you have some leader for
+ // the value, you are good. When it comes to *memory states*, only one
+ // particular thing really represents the definition of a given memory
+ // state. Once it goes away, we need to re-evaluate which pieces of memory
+ // are really still equivalent. The best way to do this is to re-value
+ // number things. The only way to really make that happen is to destroy the
+ // rest of the class. In order to effectively destroy the class, we reset
+ // ExpressionToClass for each by using the ValueToExpression mapping. The
+ // members later get marked as touched due to the leader change. We will
+ // create new congruence classes, and the pieces that are still equivalent
+ // will end back together in a new class. If this becomes too expensive, it
+ // is possible to use a versioning scheme for the congruence classes to
+ // avoid the expressions finding this old class. Note that the situation is
+ // different for memory phis, because they are evaluated anew each time, and
+ // they become equal not by hashing, but by seeing if all operands are the
+ // same (or only one is reachable).
+ if (OldClass->getStoreCount() > 0 && InstWasMemoryLeader) {
+ DEBUG(dbgs() << "Kicking everything out of class " << OldClass->getID()
+ << " because MemoryAccess leader changed");
+ for (auto Member : *OldClass)
+ ExpressionToClass.erase(ValueToExpression.lookup(Member));
}
- markLeaderChangeTouched(OldClass);
+ OldClass->setLeader(getNextValueLeader(OldClass));
+ OldClass->resetNextLeader();
+ markValueLeaderChangeTouched(OldClass);
}
}
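
   The "next leader" bookkeeping used above (resetNextLeader / addPossibleNextLeader)
   amounts to caching the member with the smallest DFS number so that a leader change
   rarely needs a full scan of the class. A minimal standalone sketch of that idea
   follows; the type and method names are invented for illustration and are not part
   of NewGVN's CongruenceClass.

    #include <climits>
    #include <utility>

    // Toy stand-in for a class's next-leader cache: remember the candidate with
    // the lowest DFS number seen so far; a full member scan is only needed when
    // the cache has been reset.
    struct NextLeaderCache {
      std::pair<const void *, unsigned> Best{nullptr, UINT_MAX};

      // Called whenever a non-leader member joins the class.
      void offer(const void *V, unsigned DFSNum) {
        if (DFSNum < Best.second)
          Best = {V, DFSNum};
      }

      void reset() { Best = {nullptr, UINT_MAX}; }
      const void *candidate() const { return Best.first; }
    };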
@@ -1150,12 +2039,12 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I,
void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
ValueToExpression[I] = E;
// This is guaranteed to return something, since it will at least find
- // INITIAL.
+ // TOP.
CongruenceClass *IClass = ValueToClass[I];
assert(IClass && "Should have found a IClass");
// Dead classes should have been eliminated from the mapping.
- assert(!IClass->Dead && "Found a dead class");
+ assert(!IClass->isDead() && "Found a dead class");
CongruenceClass *EClass;
if (const auto *VE = dyn_cast<VariableExpression>(E)) {
@@ -1171,79 +2060,52 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
// Constants and variables should always be made the leader.
if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
- NewClass->RepLeader = CE->getConstantValue();
+ NewClass->setLeader(CE->getConstantValue());
} else if (const auto *SE = dyn_cast<StoreExpression>(E)) {
StoreInst *SI = SE->getStoreInst();
- NewClass->RepLeader =
- lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent());
+ NewClass->setLeader(SI);
+ NewClass->setStoredValue(lookupOperandLeader(SI->getValueOperand()));
+ // The memory leader will be filled in properly by the
+ // moveValueToNewCongruenceClass call.
} else {
- NewClass->RepLeader = I;
+ NewClass->setLeader(I);
}
assert(!isa<VariableExpression>(E) &&
"VariableExpression should have been handled already");
EClass = NewClass;
DEBUG(dbgs() << "Created new congruence class for " << *I
- << " using expression " << *E << " at " << NewClass->ID
- << " and leader " << *(NewClass->RepLeader) << "\n");
- DEBUG(dbgs() << "Hash value was " << E->getHashValue() << "\n");
+ << " using expression " << *E << " at " << NewClass->getID()
+ << " and leader " << *(NewClass->getLeader()));
+ if (NewClass->getStoredValue())
+ DEBUG(dbgs() << " and stored value " << *(NewClass->getStoredValue()));
+ DEBUG(dbgs() << "\n");
} else {
EClass = lookupResult.first->second;
if (isa<ConstantExpression>(E))
- assert(isa<Constant>(EClass->RepLeader) &&
+ assert((isa<Constant>(EClass->getLeader()) ||
+ (EClass->getStoredValue() &&
+ isa<Constant>(EClass->getStoredValue()))) &&
"Any class with a constant expression should have a "
"constant leader");
assert(EClass && "Somehow don't have an eclass");
- assert(!EClass->Dead && "We accidentally looked up a dead class");
+ assert(!EClass->isDead() && "We accidentally looked up a dead class");
}
}
bool ClassChanged = IClass != EClass;
bool LeaderChanged = LeaderChanges.erase(I);
if (ClassChanged || LeaderChanged) {
- DEBUG(dbgs() << "Found class " << EClass->ID << " for expression " << E
+ DEBUG(dbgs() << "New class " << EClass->getID() << " for expression " << *E
<< "\n");
-
if (ClassChanged)
- moveValueToNewCongruenceClass(I, IClass, EClass);
+ moveValueToNewCongruenceClass(I, E, IClass, EClass);
markUsersTouched(I);
- if (MemoryAccess *MA = MSSA->getMemoryAccess(I)) {
- // If this is a MemoryDef, we need to update the equivalence table. If
- // we determined the expression is congruent to a different memory
- // state, use that different memory state. If we determined it didn't,
- // we update that as well. Right now, we only support store
- // expressions.
- if (!isa<MemoryUse>(MA) && isa<StoreExpression>(E) &&
- EClass->Members.size() != 1) {
- auto *DefAccess = cast<StoreExpression>(E)->getDefiningAccess();
- setMemoryAccessEquivTo(MA, DefAccess != MA ? DefAccess : nullptr);
- } else {
- setMemoryAccessEquivTo(MA, nullptr);
- }
+ if (MemoryAccess *MA = MSSA->getMemoryAccess(I))
markMemoryUsersTouched(MA);
- }
- } else if (auto *SI = dyn_cast<StoreInst>(I)) {
- // There is, sadly, one complicating thing for stores. Stores do not
- // produce values, only consume them. However, in order to make loads and
- // stores value number the same, we ignore the value operand of the store.
- // But the value operand will still be the leader of our class, and thus, it
- // may change. Because the store is a use, the store will get reprocessed,
- // but nothing will change about it, and so nothing above will catch it
- // (since the class will not change). In order to make sure everything ends
- // up okay, we need to recheck the leader of the class. Since stores of
- // different values value number differently due to different memorydefs, we
- // are guaranteed the leader is always the same between stores in the same
- // class.
- DEBUG(dbgs() << "Checking store leader\n");
- auto ProperLeader =
- lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent());
- if (EClass->RepLeader != ProperLeader) {
- DEBUG(dbgs() << "Store leader changed, fixing\n");
- EClass->RepLeader = ProperLeader;
- markLeaderChangeTouched(EClass);
- markMemoryUsersTouched(MSSA->getMemoryAccess(SI));
- }
+ if (auto *CI = dyn_cast<CmpInst>(I))
+ markPredicateUsersTouched(CI);
}
}
@@ -1267,11 +2129,11 @@ void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {
// they are the only thing that depend on new edges. Anything using their
// values will get propagated to if necessary.
if (MemoryAccess *MemPhi = MSSA->getMemoryAccess(To))
- TouchedInstructions.set(InstrDFS[MemPhi]);
+ TouchedInstructions.set(InstrToDFSNum(MemPhi));
auto BI = To->begin();
while (isa<PHINode>(BI)) {
- TouchedInstructions.set(InstrDFS[&*BI]);
+ TouchedInstructions.set(InstrToDFSNum(&*BI));
++BI;
}
}
@@ -1280,8 +2142,8 @@ void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {
// Given a predicate condition (from a switch, cmp, or whatever) and a block,
// see if we know some constant value for it already.
-Value *NewGVN::findConditionEquivalence(Value *Cond, BasicBlock *B) const {
- auto Result = lookupOperandLeader(Cond, nullptr, B);
+Value *NewGVN::findConditionEquivalence(Value *Cond) const {
+ auto Result = lookupOperandLeader(Cond);
if (isa<Constant>(Result))
return Result;
return nullptr;
@@ -1293,10 +2155,10 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
BranchInst *BR;
if ((BR = dyn_cast<BranchInst>(TI)) && BR->isConditional()) {
Value *Cond = BR->getCondition();
- Value *CondEvaluated = findConditionEquivalence(Cond, B);
+ Value *CondEvaluated = findConditionEquivalence(Cond);
if (!CondEvaluated) {
if (auto *I = dyn_cast<Instruction>(Cond)) {
- const Expression *E = createExpression(I, B);
+ const Expression *E = createExpression(I);
if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
CondEvaluated = CE->getConstantValue();
}
@@ -1329,13 +2191,13 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
Value *SwitchCond = SI->getCondition();
- Value *CondEvaluated = findConditionEquivalence(SwitchCond, B);
+ Value *CondEvaluated = findConditionEquivalence(SwitchCond);
// See if we were able to turn this switch statement into a constant.
if (CondEvaluated && isa<ConstantInt>(CondEvaluated)) {
auto *CondVal = cast<ConstantInt>(CondEvaluated);
// We should be able to get case value for this.
- auto CaseVal = SI->findCaseValue(CondVal);
- if (CaseVal.getCaseSuccessor() == SI->getDefaultDest()) {
+ auto Case = *SI->findCaseValue(CondVal);
+ if (Case.getCaseSuccessor() == SI->getDefaultDest()) {
// We proved the value is outside of the range of the case.
// We can't do anything other than mark the default dest as reachable,
// and go home.
@@ -1343,7 +2205,7 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
return;
}
// Now get where it goes and mark it reachable.
- BasicBlock *TargetBlock = CaseVal.getCaseSuccessor();
+ BasicBlock *TargetBlock = Case.getCaseSuccessor();
updateReachableEdge(B, TargetBlock);
} else {
for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
@@ -1361,45 +2223,66 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
}
// This also may be a memory defining terminator, in which case, set it
- // equivalent to nothing.
- if (MemoryAccess *MA = MSSA->getMemoryAccess(TI))
- setMemoryAccessEquivTo(MA, nullptr);
+ // equivalent only to itself.
+ //
+ auto *MA = MSSA->getMemoryAccess(TI);
+ if (MA && !isa<MemoryUse>(MA)) {
+ auto *CC = ensureLeaderOfMemoryClass(MA);
+ if (setMemoryClass(MA, CC))
+ markMemoryUsersTouched(MA);
+ }
}
}
-// The algorithm initially places the values of the routine in the INITIAL
-// congruence
-// class. The leader of INITIAL is the undetermined value `TOP`.
-// When the algorithm has finished, values still in INITIAL are unreachable.
+// The algorithm initially places the values of the routine in the TOP
+// congruence class. The leader of TOP is the undetermined value `undef`.
+// When the algorithm has finished, values still in TOP are unreachable.
void NewGVN::initializeCongruenceClasses(Function &F) {
- // FIXME now i can't remember why this is 2
- NextCongruenceNum = 2;
- // Initialize all other instructions to be in INITIAL class.
- CongruenceClass::MemberSet InitialValues;
- InitialClass = createCongruenceClass(nullptr, nullptr);
+ NextCongruenceNum = 0;
+
+ // Note that even though we use the live on entry def as a representative
+ // MemoryAccess, it is *not* the same as the actual live on entry def. We
+ // have no real equivalent to undef for MemoryAccesses, and so we really
+ // should be checking whether the MemoryAccess is top if we want to know if it
+ // is equivalent to everything. Otherwise, what this really signifies is that
+ // the access reaches all the way back to the beginning of the function.
+
+ // Initialize all other instructions to be in TOP class.
+ TOPClass = createCongruenceClass(nullptr, nullptr);
+ TOPClass->setMemoryLeader(MSSA->getLiveOnEntryDef());
+ // The live on entry def gets put into its own class.
+ MemoryAccessToClass[MSSA->getLiveOnEntryDef()] =
+ createMemoryClass(MSSA->getLiveOnEntryDef());
+
for (auto &B : F) {
- if (auto *MP = MSSA->getMemoryAccess(&B))
- MemoryAccessEquiv.insert({MP, MSSA->getLiveOnEntryDef()});
+ // All MemoryAccesses are equivalent to live on entry to start. They must
+ // be initialized to something so that initial changes are noticed. For
+ // the maximal answer, we initialize them all to be the same as
+ // liveOnEntry.
+ auto *MemoryBlockDefs = MSSA->getBlockDefs(&B);
+ if (MemoryBlockDefs)
+ for (const auto &Def : *MemoryBlockDefs) {
+ MemoryAccessToClass[&Def] = TOPClass;
+ auto *MD = dyn_cast<MemoryDef>(&Def);
+ // Insert the memory phis into the member list.
+ if (!MD) {
+ const MemoryPhi *MP = cast<MemoryPhi>(&Def);
+ TOPClass->memory_insert(MP);
+ MemoryPhiState.insert({MP, MPS_TOP});
+ }
- for (auto &I : B) {
- InitialValues.insert(&I);
- ValueToClass[&I] = InitialClass;
- // All memory accesses are equivalent to live on entry to start. They must
- // be initialized to something so that initial changes are noticed. For
- // the maximal answer, we initialize them all to be the same as
- // liveOnEntry. Note that to save time, we only initialize the
- // MemoryDef's for stores and all MemoryPhis to be equal. Right now, no
- // other expression can generate a memory equivalence. If we start
- // handling memcpy/etc, we can expand this.
- if (isa<StoreInst>(&I)) {
- MemoryAccessEquiv.insert(
- {MSSA->getMemoryAccess(&I), MSSA->getLiveOnEntryDef()});
- ++InitialClass->StoreCount;
- assert(InitialClass->StoreCount > 0);
+ if (MD && isa<StoreInst>(MD->getMemoryInst()))
+ TOPClass->incStoreCount();
}
+ for (auto &I : B) {
+ // Don't insert void terminators into the class. We don't value number
+ // them, and they just end up sitting in TOP.
+ if (isa<TerminatorInst>(I) && I.getType()->isVoidTy())
+ continue;
+ TOPClass->insert(&I);
+ ValueToClass[&I] = TOPClass;
}
}
- InitialClass->Members.swap(InitialValues);
// Initialize arguments to be in their own unique congruence classes
for (auto &FA : F.args())
@@ -1408,8 +2291,8 @@ void NewGVN::initializeCongruenceClasses(Function &F) {
void NewGVN::cleanupTables() {
for (unsigned i = 0, e = CongruenceClasses.size(); i != e; ++i) {
- DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->ID << " has "
- << CongruenceClasses[i]->Members.size() << " members\n");
+ DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->getID()
+ << " has " << CongruenceClasses[i]->size() << " members\n");
// Make sure we delete the congruence class (probably worth switching to
// a unique_ptr at some point.
delete CongruenceClasses[i];
@@ -1427,15 +2310,14 @@ void NewGVN::cleanupTables() {
#ifndef NDEBUG
ProcessedCount.clear();
#endif
- DFSDomMap.clear();
InstrDFS.clear();
InstructionsToErase.clear();
-
DFSToInstr.clear();
BlockInstRange.clear();
TouchedInstructions.clear();
- DominatedInstRange.clear();
- MemoryAccessEquiv.clear();
+ MemoryAccessToClass.clear();
+ PredicateToUsers.clear();
+ MemoryToUsers.clear();
}
std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
@@ -1447,6 +2329,16 @@ std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
}
for (auto &I : *B) {
+ // There's no need to call isInstructionTriviallyDead more than once on
+ // an instruction. Therefore, once we know that an instruction is dead
+ // we change its DFS number so that it doesn't get value numbered.
+ if (isInstructionTriviallyDead(&I, TLI)) {
+ InstrDFS[&I] = 0;
+ DEBUG(dbgs() << "Skipping trivially dead instruction " << I << "\n");
+ markInstructionForDeletion(&I);
+ continue;
+ }
+
InstrDFS[&I] = End++;
DFSToInstr.emplace_back(&I);
}
@@ -1462,7 +2354,7 @@ void NewGVN::updateProcessedCount(Value *V) {
if (ProcessedCount.count(V) == 0) {
ProcessedCount.insert({V, 1});
} else {
- ProcessedCount[V] += 1;
+ ++ProcessedCount[V];
assert(ProcessedCount[V] < 100 &&
"Seem to have processed the same Value a lot");
}
@@ -1472,26 +2364,33 @@ void NewGVN::updateProcessedCount(Value *V) {
void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
// If all the arguments are the same, the MemoryPhi has the same value as the
// argument.
- // Filter out unreachable blocks from our operands.
+ // Filter out unreachable blocks and self phis from our operands.
+ const BasicBlock *PHIBlock = MP->getBlock();
auto Filtered = make_filter_range(MP->operands(), [&](const Use &U) {
- return ReachableBlocks.count(MP->getIncomingBlock(U));
+ return lookupMemoryLeader(cast<MemoryAccess>(U)) != MP &&
+ !isMemoryAccessTop(cast<MemoryAccess>(U)) &&
+ ReachableEdges.count({MP->getIncomingBlock(U), PHIBlock});
});
-
- assert(Filtered.begin() != Filtered.end() &&
- "We should not be processing a MemoryPhi in a completely "
- "unreachable block");
+ // If all that is left is nothing, our MemoryPhi is undef. We keep it in
+ // TOPClass. Note: The only case where this should happen is if we have at
+ // least one self-argument.
+ if (Filtered.begin() == Filtered.end()) {
+ if (setMemoryClass(MP, TOPClass))
+ markMemoryUsersTouched(MP);
+ return;
+ }
// Transform the remaining operands into operand leaders.
// FIXME: mapped_iterator should have a range version.
auto LookupFunc = [&](const Use &U) {
- return lookupMemoryAccessEquiv(cast<MemoryAccess>(U));
+ return lookupMemoryLeader(cast<MemoryAccess>(U));
};
auto MappedBegin = map_iterator(Filtered.begin(), LookupFunc);
auto MappedEnd = map_iterator(Filtered.end(), LookupFunc);
// and now check if all the elements are equal.
// Sadly, we can't use std::equals since these are random access iterators.
- MemoryAccess *AllSameValue = *MappedBegin;
+ const auto *AllSameValue = *MappedBegin;
++MappedBegin;
bool AllEqual = std::all_of(
MappedBegin, MappedEnd,
@@ -1501,8 +2400,18 @@ void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
DEBUG(dbgs() << "Memory Phi value numbered to " << *AllSameValue << "\n");
else
DEBUG(dbgs() << "Memory Phi value numbered to itself\n");
-
- if (setMemoryAccessEquivTo(MP, AllEqual ? AllSameValue : nullptr))
+ // If it's equal to something, it's in that class. Otherwise, it has to be in
+ // a class where it is the leader (other things may be equivalent to it, but
+ // it needs to start off in its own class, which means it must have been the
+ // leader, and it can't have stopped being the leader because it was never
+ // removed).
+ CongruenceClass *CC =
+ AllEqual ? getMemoryClass(AllSameValue) : ensureLeaderOfMemoryClass(MP);
+ auto OldState = MemoryPhiState.lookup(MP);
+ assert(OldState != MPS_Invalid && "Invalid memory phi state");
+ auto NewState = AllEqual ? MPS_Equivalent : MPS_Unique;
+ MemoryPhiState[MP] = NewState;
+ if (setMemoryClass(MP, CC) || OldState != NewState)
markMemoryUsersTouched(MP);
}
@@ -1510,21 +2419,25 @@ void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
// congruence finding, and updating mappings.
void NewGVN::valueNumberInstruction(Instruction *I) {
DEBUG(dbgs() << "Processing instruction " << *I << "\n");
- if (isInstructionTriviallyDead(I, TLI)) {
- DEBUG(dbgs() << "Skipping unused instruction\n");
- markInstructionForDeletion(I);
- return;
- }
if (!I->isTerminator()) {
- const auto *Symbolized = performSymbolicEvaluation(I, I->getParent());
+ const Expression *Symbolized = nullptr;
+ if (DebugCounter::shouldExecute(VNCounter)) {
+ Symbolized = performSymbolicEvaluation(I);
+ } else {
+ // Mark the instruction as unused so we don't value number it again.
+ InstrDFS[I] = 0;
+ }
// If we couldn't come up with a symbolic expression, use the unknown
// expression
- if (Symbolized == nullptr)
+ if (Symbolized == nullptr) {
Symbolized = createUnknownExpression(I);
+ }
+
performCongruenceFinding(I, Symbolized);
} else {
// Handle terminators that return values. All of them produce values we
- // don't currently understand.
+ // don't currently understand. We don't place non-value producing
+ // terminators in a class.
if (!I->getType()->isVoidTy()) {
auto *Symbolized = createUnknownExpression(I);
performCongruenceFinding(I, Symbolized);
@@ -1539,72 +2452,102 @@ bool NewGVN::singleReachablePHIPath(const MemoryAccess *First,
const MemoryAccess *Second) const {
if (First == Second)
return true;
-
- if (auto *FirstDef = dyn_cast<MemoryUseOrDef>(First)) {
- auto *DefAccess = FirstDef->getDefiningAccess();
- return singleReachablePHIPath(DefAccess, Second);
- } else {
- auto *MP = cast<MemoryPhi>(First);
- auto ReachableOperandPred = [&](const Use &U) {
- return ReachableBlocks.count(MP->getIncomingBlock(U));
- };
- auto FilteredPhiArgs =
- make_filter_range(MP->operands(), ReachableOperandPred);
- SmallVector<const Value *, 32> OperandList;
- std::copy(FilteredPhiArgs.begin(), FilteredPhiArgs.end(),
- std::back_inserter(OperandList));
- bool Okay = OperandList.size() == 1;
- if (!Okay)
- Okay = std::equal(OperandList.begin(), OperandList.end(),
- OperandList.begin());
- if (Okay)
- return singleReachablePHIPath(cast<MemoryAccess>(OperandList[0]), Second);
+ if (MSSA->isLiveOnEntryDef(First))
return false;
+
+ const auto *EndDef = First;
+ for (auto *ChainDef : optimized_def_chain(First)) {
+ if (ChainDef == Second)
+ return true;
+ if (MSSA->isLiveOnEntryDef(ChainDef))
+ return false;
+ EndDef = ChainDef;
}
+ auto *MP = cast<MemoryPhi>(EndDef);
+ auto ReachableOperandPred = [&](const Use &U) {
+ return ReachableEdges.count({MP->getIncomingBlock(U), MP->getBlock()});
+ };
+ auto FilteredPhiArgs =
+ make_filter_range(MP->operands(), ReachableOperandPred);
+ SmallVector<const Value *, 32> OperandList;
+ std::copy(FilteredPhiArgs.begin(), FilteredPhiArgs.end(),
+ std::back_inserter(OperandList));
+ bool Okay = OperandList.size() == 1;
+ if (!Okay)
+ Okay =
+ std::equal(OperandList.begin(), OperandList.end(), OperandList.begin());
+ if (Okay)
+ return singleReachablePHIPath(cast<MemoryAccess>(OperandList[0]), Second);
+ return false;
}
 // Verify that the memory equivalence table makes sense relative to the
// congruence classes. Note that this checking is not perfect, and is currently
-// subject to very rare false negatives. It is only useful for testing/debugging.
+// subject to very rare false negatives. It is only useful for
+// testing/debugging.
void NewGVN::verifyMemoryCongruency() const {
- // Anything equivalent in the memory access table should be in the same
+#ifndef NDEBUG
+ // Verify that the memory equivalence table and the memory member sets match.
+ for (const auto *CC : CongruenceClasses) {
+ if (CC == TOPClass || CC->isDead())
+ continue;
+ if (CC->getStoreCount() != 0) {
+ assert((CC->getStoredValue() || !isa<StoreInst>(CC->getLeader())) &&
+ "Any class with a store as a "
+ "leader should have a "
+ "representative stored value\n");
+ assert(CC->getMemoryLeader() &&
+ "Any congruence class with a store should "
+ "have a representative access\n");
+ }
+
+ if (CC->getMemoryLeader())
+ assert(MemoryAccessToClass.lookup(CC->getMemoryLeader()) == CC &&
+ "Representative MemoryAccess does not appear to be reverse "
+ "mapped properly");
+ for (auto M : CC->memory())
+ assert(MemoryAccessToClass.lookup(M) == CC &&
+ "Memory member does not appear to be reverse mapped properly");
+ }
+
+ // Anything equivalent in the MemoryAccess table should be in the same
// congruence class.
// Filter out the unreachable and trivially dead entries, because they may
// never have been updated if the instructions were not processed.
auto ReachableAccessPred =
- [&](const std::pair<const MemoryAccess *, MemoryAccess *> Pair) {
+ [&](const std::pair<const MemoryAccess *, CongruenceClass *> Pair) {
bool Result = ReachableBlocks.count(Pair.first->getBlock());
if (!Result)
return false;
+ if (MSSA->isLiveOnEntryDef(Pair.first))
+ return true;
if (auto *MemDef = dyn_cast<MemoryDef>(Pair.first))
return !isInstructionTriviallyDead(MemDef->getMemoryInst());
+ if (MemoryToDFSNum(Pair.first) == 0)
+ return false;
return true;
};
- auto Filtered = make_filter_range(MemoryAccessEquiv, ReachableAccessPred);
+ auto Filtered = make_filter_range(MemoryAccessToClass, ReachableAccessPred);
for (auto KV : Filtered) {
- assert(KV.first != KV.second &&
- "We added a useless equivalence to the memory equivalence table");
- // Unreachable instructions may not have changed because we never process
- // them.
- if (!ReachableBlocks.count(KV.first->getBlock()))
- continue;
+ assert(KV.second != TOPClass &&
+ "Memory not unreachable but ended up in TOP");
if (auto *FirstMUD = dyn_cast<MemoryUseOrDef>(KV.first)) {
- auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second);
+ auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second->getMemoryLeader());
if (FirstMUD && SecondMUD)
assert((singleReachablePHIPath(FirstMUD, SecondMUD) ||
- ValueToClass.lookup(FirstMUD->getMemoryInst()) ==
- ValueToClass.lookup(SecondMUD->getMemoryInst())) &&
- "The instructions for these memory operations should have "
- "been in the same congruence class or reachable through"
- "a single argument phi");
+ ValueToClass.lookup(FirstMUD->getMemoryInst()) ==
+ ValueToClass.lookup(SecondMUD->getMemoryInst())) &&
+ "The instructions for these memory operations should have "
+ "been in the same congruence class or reachable through"
+ "a single argument phi");
} else if (auto *FirstMP = dyn_cast<MemoryPhi>(KV.first)) {
-
// We can only sanely verify that MemoryDefs in the operand list all have
// the same class.
auto ReachableOperandPred = [&](const Use &U) {
- return ReachableBlocks.count(FirstMP->getIncomingBlock(U)) &&
+ return ReachableEdges.count(
+ {FirstMP->getIncomingBlock(U), FirstMP->getBlock()}) &&
isa<MemoryDef>(U);
};
@@ -1622,19 +2565,127 @@ void NewGVN::verifyMemoryCongruency() const {
"All MemoryPhi arguments should be in the same class");
}
}
+#endif
+}
+
+// Verify that the sparse propagation we did actually found the maximal fixpoint.
+// We do this by storing the value to class mapping, touching all instructions,
+// and redoing the iteration to see if anything changed.
+void NewGVN::verifyIterationSettled(Function &F) {
+#ifndef NDEBUG
+ DEBUG(dbgs() << "Beginning iteration verification\n");
+ if (DebugCounter::isCounterSet(VNCounter))
+ DebugCounter::setCounterValue(VNCounter, StartingVNCounter);
+
+ // Note that we have to store the actual classes, as we may change existing
+ // classes during iteration. This is because our memory iteration propagation
+ // is not perfect, and so may waste a little work. But it should generate
+ // exactly the same congruence classes we have now, with different IDs.
+ std::map<const Value *, CongruenceClass> BeforeIteration;
+
+ for (auto &KV : ValueToClass) {
+ if (auto *I = dyn_cast<Instruction>(KV.first))
+ // Skip unused/dead instructions.
+ if (InstrToDFSNum(I) == 0)
+ continue;
+ BeforeIteration.insert({KV.first, *KV.second});
+ }
+
+ TouchedInstructions.set();
+ TouchedInstructions.reset(0);
+ iterateTouchedInstructions();
+ DenseSet<std::pair<const CongruenceClass *, const CongruenceClass *>>
+ EqualClasses;
+ for (const auto &KV : ValueToClass) {
+ if (auto *I = dyn_cast<Instruction>(KV.first))
+ // Skip unused/dead instructions.
+ if (InstrToDFSNum(I) == 0)
+ continue;
+ // We could sink these uses, but I think this adds a bit of clarity here as
+ // to what we are comparing.
+ auto *BeforeCC = &BeforeIteration.find(KV.first)->second;
+ auto *AfterCC = KV.second;
+ // Note that the classes can't change at this point, so we memoize the set
+ // that are equal.
+ if (!EqualClasses.count({BeforeCC, AfterCC})) {
+ assert(BeforeCC->isEquivalentTo(AfterCC) &&
+ "Value number changed after main loop completed!");
+ EqualClasses.insert({BeforeCC, AfterCC});
+ }
+ }
+#endif
+}
+
+// This is the main value numbering loop, it iterates over the initial touched
+// instruction set, propagating value numbers, marking things touched, etc,
+// until the set of touched instructions is completely empty.
+void NewGVN::iterateTouchedInstructions() {
+ unsigned int Iterations = 0;
+ // Figure out where TouchedInstructions starts.
+ int FirstInstr = TouchedInstructions.find_first();
+ // Nothing set, nothing to iterate, just return.
+ if (FirstInstr == -1)
+ return;
+ BasicBlock *LastBlock = getBlockForValue(InstrFromDFSNum(FirstInstr));
+ while (TouchedInstructions.any()) {
+ ++Iterations;
+ // Walk through all the instructions in all the blocks in RPO.
+ // TODO: As we hit a new block, we should push and pop equalities into a
+ // table lookupOperandLeader can use, to catch things PredicateInfo
+ // might miss, like edge-only equivalences.
+ for (int InstrNum = TouchedInstructions.find_first(); InstrNum != -1;
+ InstrNum = TouchedInstructions.find_next(InstrNum)) {
+
+ // This instruction was found to be dead. We don't bother looking
+ // at it again.
+ if (InstrNum == 0) {
+ TouchedInstructions.reset(InstrNum);
+ continue;
+ }
+
+ Value *V = InstrFromDFSNum(InstrNum);
+ BasicBlock *CurrBlock = getBlockForValue(V);
+
+ // If we hit a new block, do reachability processing.
+ if (CurrBlock != LastBlock) {
+ LastBlock = CurrBlock;
+ bool BlockReachable = ReachableBlocks.count(CurrBlock);
+ const auto &CurrInstRange = BlockInstRange.lookup(CurrBlock);
+
+ // If it's not reachable, erase any touched instructions and move on.
+ if (!BlockReachable) {
+ TouchedInstructions.reset(CurrInstRange.first, CurrInstRange.second);
+ DEBUG(dbgs() << "Skipping instructions in block "
+ << getBlockName(CurrBlock)
+ << " because it is unreachable\n");
+ continue;
+ }
+ updateProcessedCount(CurrBlock);
+ }
+
+ if (auto *MP = dyn_cast<MemoryPhi>(V)) {
+ DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n");
+ valueNumberMemoryPhi(MP);
+ } else if (auto *I = dyn_cast<Instruction>(V)) {
+ valueNumberInstruction(I);
+ } else {
+ llvm_unreachable("Should have been a MemoryPhi or Instruction");
+ }
+ updateProcessedCount(V);
+ // Reset after processing (because we may mark ourselves as touched when
+ // we propagate equalities).
+ TouchedInstructions.reset(InstrNum);
+ }
+ }
+ NumGVNMaxIterations = std::max(NumGVNMaxIterations.getValue(), Iterations);
}
// This is the main transformation entry point.
-bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
- TargetLibraryInfo *_TLI, AliasAnalysis *_AA,
- MemorySSA *_MSSA) {
+bool NewGVN::runGVN() {
+ if (DebugCounter::isCounterSet(VNCounter))
+ StartingVNCounter = DebugCounter::getCounterValue(VNCounter);
bool Changed = false;
- DT = _DT;
- AC = _AC;
- TLI = _TLI;
- AA = _AA;
- MSSA = _MSSA;
- DL = &F.getParent()->getDataLayout();
+ NumFuncArgs = F.arg_size();
MSSAWalker = MSSA->getWalker();
// Count number of instructions for sizing of hash tables, and come
@@ -1642,15 +2693,14 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
unsigned ICount = 1;
// Add an empty instruction to account for the fact that we start at 1
DFSToInstr.emplace_back(nullptr);
- // Note: We want RPO traversal of the blocks, which is not quite the same as
- // dominator tree order, particularly with regard whether backedges get
- // visited first or second, given a block with multiple successors.
+ // Note: We want ideal RPO traversal of the blocks, which is not quite the
+ // same as dominator tree order, particularly with regard to whether backedges
+ // get visited first or second, given a block with multiple successors.
// If we visit in the wrong order, we will end up performing N times as many
// iterations.
 // The dominator tree does guarantee that, for a given dom tree node, its
// parent must occur before it in the RPO ordering. Thus, we only need to sort
// the siblings.
- DenseMap<const DomTreeNode *, unsigned> RPOOrdering;
ReversePostOrderTraversal<Function *> RPOT(&F);
unsigned Counter = 0;
for (auto &B : RPOT) {
@@ -1663,7 +2713,7 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
auto *Node = DT->getNode(B);
if (Node->getChildren().size() > 1)
std::sort(Node->begin(), Node->end(),
- [&RPOOrdering](const DomTreeNode *A, const DomTreeNode *B) {
+ [&](const DomTreeNode *A, const DomTreeNode *B) {
return RPOOrdering[A] < RPOOrdering[B];
});
}
@@ -1689,7 +2739,6 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
}
TouchedInstructions.resize(ICount);
- DominatedInstRange.reserve(F.size());
// Ensure we don't end up resizing the expressionToClass map, as
// that can be quite expensive. At most, we have one expression per
// instruction.
@@ -1701,62 +2750,10 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
ReachableBlocks.insert(&F.getEntryBlock());
initializeCongruenceClasses(F);
-
- unsigned int Iterations = 0;
- // We start out in the entry block.
- BasicBlock *LastBlock = &F.getEntryBlock();
- while (TouchedInstructions.any()) {
- ++Iterations;
- // Walk through all the instructions in all the blocks in RPO.
- for (int InstrNum = TouchedInstructions.find_first(); InstrNum != -1;
- InstrNum = TouchedInstructions.find_next(InstrNum)) {
- assert(InstrNum != 0 && "Bit 0 should never be set, something touched an "
- "instruction not in the lookup table");
- Value *V = DFSToInstr[InstrNum];
- BasicBlock *CurrBlock = nullptr;
-
- if (auto *I = dyn_cast<Instruction>(V))
- CurrBlock = I->getParent();
- else if (auto *MP = dyn_cast<MemoryPhi>(V))
- CurrBlock = MP->getBlock();
- else
- llvm_unreachable("DFSToInstr gave us an unknown type of instruction");
-
- // If we hit a new block, do reachability processing.
- if (CurrBlock != LastBlock) {
- LastBlock = CurrBlock;
- bool BlockReachable = ReachableBlocks.count(CurrBlock);
- const auto &CurrInstRange = BlockInstRange.lookup(CurrBlock);
-
- // If it's not reachable, erase any touched instructions and move on.
- if (!BlockReachable) {
- TouchedInstructions.reset(CurrInstRange.first, CurrInstRange.second);
- DEBUG(dbgs() << "Skipping instructions in block "
- << getBlockName(CurrBlock)
- << " because it is unreachable\n");
- continue;
- }
- updateProcessedCount(CurrBlock);
- }
-
- if (auto *MP = dyn_cast<MemoryPhi>(V)) {
- DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n");
- valueNumberMemoryPhi(MP);
- } else if (auto *I = dyn_cast<Instruction>(V)) {
- valueNumberInstruction(I);
- } else {
- llvm_unreachable("Should have been a MemoryPhi or Instruction");
- }
- updateProcessedCount(V);
- // Reset after processing (because we may mark ourselves as touched when
- // we propagate equalities).
- TouchedInstructions.reset(InstrNum);
- }
- }
- NumGVNMaxIterations = std::max(NumGVNMaxIterations.getValue(), Iterations);
-#ifndef NDEBUG
+ iterateTouchedInstructions();
verifyMemoryCongruency();
-#endif
+ verifyIterationSettled(F);
+
Changed |= eliminateInstructions(F);
// Delete all instructions marked for deletion.
@@ -1783,36 +2780,6 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
return Changed;
}
-bool NewGVN::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
- return runGVN(F, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
- &getAnalysis<AAResultsWrapperPass>().getAAResults(),
- &getAnalysis<MemorySSAWrapperPass>().getMSSA());
-}
-
-PreservedAnalyses NewGVNPass::run(Function &F, AnalysisManager<Function> &AM) {
- NewGVN Impl;
-
- // Apparently the order in which we get these results matter for
- // the old GVN (see Chandler's comment in GVN.cpp). I'll keep
- // the same order here, just in case.
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &AA = AM.getResult<AAManager>(F);
- auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
- bool Changed = Impl.runGVN(F, &DT, &AC, &TLI, &AA, &MSSA);
- if (!Changed)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
// Return true if V is a value that will always be available (IE can
// be placed anywhere) in the function. We don't do globals here
// because they are often worse to put in place.
@@ -1821,21 +2788,15 @@ static bool alwaysAvailable(Value *V) {
return isa<Constant>(V) || isa<Argument>(V);
}
-// Get the basic block from an instruction/value.
-static BasicBlock *getBlockForValue(Value *V) {
- if (auto *I = dyn_cast<Instruction>(V))
- return I->getParent();
- return nullptr;
-}
-
struct NewGVN::ValueDFS {
int DFSIn = 0;
int DFSOut = 0;
int LocalNum = 0;
- // Only one of these will be set.
- Value *Val = nullptr;
+ // Only one of Def and U will be set.
+ // The bool in the Def tells us whether the Def is the stored value of a
+ // store.
+ PointerIntPair<Value *, 1, bool> Def;
Use *U = nullptr;
-
bool operator<(const ValueDFS &Other) const {
// It's not enough that any given field be less than - we have sets
// of fields that need to be evaluated together to give a proper ordering.
@@ -1875,89 +2836,151 @@ struct NewGVN::ValueDFS {
// but .val and .u.
// It does not matter what order we replace these operands in.
// You will always end up with the same IR, and this is guaranteed.
- return std::tie(DFSIn, DFSOut, LocalNum, Val, U) <
- std::tie(Other.DFSIn, Other.DFSOut, Other.LocalNum, Other.Val,
+ return std::tie(DFSIn, DFSOut, LocalNum, Def, U) <
+ std::tie(Other.DFSIn, Other.DFSOut, Other.LocalNum, Other.Def,
Other.U);
}
};
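
   The DFSIn/DFSOut pairs carried by ValueDFS come from a DFS numbering of the
   dominator tree, where domination is an interval-nesting test; that is what the
   elimination stack's scope checks later rely on. A one-line illustrative helper
   (invented name, not part of the pass) showing the test:

    // A dominates B in the dominator tree iff B's DFS interval nests inside A's.
    static bool dfsIntervalDominates(int AIn, int AOut, int BIn, int BOut) {
      return AIn <= BIn && BOut <= AOut;
    }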
-void NewGVN::convertDenseToDFSOrdered(
- CongruenceClass::MemberSet &Dense,
- SmallVectorImpl<ValueDFS> &DFSOrderedSet) {
+// This function converts the set of members for a congruence class from values
+// to sets of defs and uses with associated DFS info. The total number of
+// reachable uses for each value is stored in UseCounts, and instructions that
+// seem dead (have no non-dead uses) are stored in ProbablyDead.
+void NewGVN::convertClassToDFSOrdered(
+ const CongruenceClass &Dense, SmallVectorImpl<ValueDFS> &DFSOrderedSet,
+ DenseMap<const Value *, unsigned int> &UseCounts,
+ SmallPtrSetImpl<Instruction *> &ProbablyDead) const {
for (auto D : Dense) {
// First add the value.
BasicBlock *BB = getBlockForValue(D);
// Constants are handled prior to ever calling this function, so
// we should only be left with instructions as members.
assert(BB && "Should have figured out a basic block for value");
- ValueDFS VD;
-
- std::pair<int, int> DFSPair = DFSDomMap[BB];
- assert(DFSPair.first != -1 && DFSPair.second != -1 && "Invalid DFS Pair");
- VD.DFSIn = DFSPair.first;
- VD.DFSOut = DFSPair.second;
- VD.Val = D;
- // If it's an instruction, use the real local dfs number.
- if (auto *I = dyn_cast<Instruction>(D))
- VD.LocalNum = InstrDFS[I];
- else
- llvm_unreachable("Should have been an instruction");
-
- DFSOrderedSet.emplace_back(VD);
-
- // Now add the users.
- for (auto &U : D->uses()) {
+ ValueDFS VDDef;
+ DomTreeNode *DomNode = DT->getNode(BB);
+ VDDef.DFSIn = DomNode->getDFSNumIn();
+ VDDef.DFSOut = DomNode->getDFSNumOut();
+ // If it's a store, use the leader of the value operand, if it's always
+ // available, or the value operand. TODO: We could do dominance checks to
+ // find a dominating leader, but not worth it ATM.
+ if (auto *SI = dyn_cast<StoreInst>(D)) {
+ auto Leader = lookupOperandLeader(SI->getValueOperand());
+ if (alwaysAvailable(Leader)) {
+ VDDef.Def.setPointer(Leader);
+ } else {
+ VDDef.Def.setPointer(SI->getValueOperand());
+ VDDef.Def.setInt(true);
+ }
+ } else {
+ VDDef.Def.setPointer(D);
+ }
+ assert(isa<Instruction>(D) &&
+ "The dense set member should always be an instruction");
+ VDDef.LocalNum = InstrToDFSNum(D);
+ DFSOrderedSet.emplace_back(VDDef);
+ Instruction *Def = cast<Instruction>(D);
+ unsigned int UseCount = 0;
+ // Now add the uses.
+ for (auto &U : Def->uses()) {
if (auto *I = dyn_cast<Instruction>(U.getUser())) {
- ValueDFS VD;
+ // Don't try to replace into dead uses
+ if (InstructionsToErase.count(I))
+ continue;
+ ValueDFS VDUse;
// Put the phi node uses in the incoming block.
BasicBlock *IBlock;
if (auto *P = dyn_cast<PHINode>(I)) {
IBlock = P->getIncomingBlock(U);
// Make phi node users appear last in the incoming block
// they are from.
- VD.LocalNum = InstrDFS.size() + 1;
+ VDUse.LocalNum = InstrDFS.size() + 1;
} else {
IBlock = I->getParent();
- VD.LocalNum = InstrDFS[I];
+ VDUse.LocalNum = InstrToDFSNum(I);
}
- std::pair<int, int> DFSPair = DFSDomMap[IBlock];
- VD.DFSIn = DFSPair.first;
- VD.DFSOut = DFSPair.second;
- VD.U = &U;
- DFSOrderedSet.emplace_back(VD);
+
+ // Skip uses in unreachable blocks, as we're going
+ // to delete them.
+ if (ReachableBlocks.count(IBlock) == 0)
+ continue;
+
+ DomTreeNode *DomNode = DT->getNode(IBlock);
+ VDUse.DFSIn = DomNode->getDFSNumIn();
+ VDUse.DFSOut = DomNode->getDFSNumOut();
+ VDUse.U = &U;
+ ++UseCount;
+ DFSOrderedSet.emplace_back(VDUse);
}
}
+
+ // If there are no uses, it's probably dead (but it may have side-effects,
+ // so it is not definitely dead). Otherwise, store the number of uses so we
+ // can track whether it becomes dead later.
+ if (UseCount == 0)
+ ProbablyDead.insert(Def);
+ else
+ UseCounts[Def] = UseCount;
}
}
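
   The UseCounts/ProbablyDead pair built here drives a small reference-count style
   protocol during elimination: when a use is redirected away from a def, its count
   drops and it may become a deletion candidate; when a dominating leader gains a
   use, it must come back off the candidate list. A standalone sketch of that
   bookkeeping follows; the type and keys are toys invented for the sketch.

    #include <unordered_map>
    #include <unordered_set>

    struct DeadCandidateTracker {
      std::unordered_map<const void *, unsigned> UseCounts;
      std::unordered_set<const void *> ProbablyDead;

      // A use of Def went away (e.g. its user was deleted or rewritten).
      void useRemoved(const void *Def) {
        auto It = UseCounts.find(Def);
        if (It != UseCounts.end() && It->second > 0 && --It->second == 0)
          ProbablyDead.insert(Def);
      }

      // Def gained a use (e.g. it became the dominating leader of a rewrite).
      void useAdded(const void *Def) {
        if (UseCounts[Def]++ == 0)
          ProbablyDead.erase(Def);
      }
    };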
-static void patchReplacementInstruction(Instruction *I, Value *Repl) {
- // Patch the replacement so that it is not more restrictive than the value
- // being replaced.
- auto *Op = dyn_cast<BinaryOperator>(I);
- auto *ReplOp = dyn_cast<BinaryOperator>(Repl);
+// This function converts the set of members for a congruence class from values
+// to the set of defs for loads and stores, with associated DFS info.
+void NewGVN::convertClassToLoadsAndStores(
+ const CongruenceClass &Dense,
+ SmallVectorImpl<ValueDFS> &LoadsAndStores) const {
+ for (auto D : Dense) {
+ if (!isa<LoadInst>(D) && !isa<StoreInst>(D))
+ continue;
- if (Op && ReplOp)
- ReplOp->andIRFlags(Op);
+ BasicBlock *BB = getBlockForValue(D);
+ ValueDFS VD;
+ DomTreeNode *DomNode = DT->getNode(BB);
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.Def.setPointer(D);
- if (auto *ReplInst = dyn_cast<Instruction>(Repl)) {
- // FIXME: If both the original and replacement value are part of the
- // same control-flow region (meaning that the execution of one
- // guarentees the executation of the other), then we can combine the
- // noalias scopes here and do better than the general conservative
- // answer used in combineMetadata().
+ // If it's an instruction, use the real local dfs number.
+ if (auto *I = dyn_cast<Instruction>(D))
+ VD.LocalNum = InstrToDFSNum(I);
+ else
+ llvm_unreachable("Should have been an instruction");
- // In general, GVN unifies expressions over different control-flow
- // regions, and so we need a conservative combination of the noalias
- // scopes.
- unsigned KnownIDs[] = {
- LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
- LLVMContext::MD_noalias, LLVMContext::MD_range,
- LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
- LLVMContext::MD_invariant_group};
- combineMetadata(ReplInst, I, KnownIDs);
+ LoadsAndStores.emplace_back(VD);
}
}
+static void patchReplacementInstruction(Instruction *I, Value *Repl) {
+ auto *ReplInst = dyn_cast<Instruction>(Repl);
+ if (!ReplInst)
+ return;
+
+ // Patch the replacement so that it is not more restrictive than the value
+ // being replaced.
+ // Note that if 'I' is a load being replaced by some operation,
+ // for example, by an arithmetic operation, then andIRFlags()
+ // would just erase all math flags from the original arithmetic
+ // operation, which is clearly not wanted and not needed.
+ if (!isa<LoadInst>(I))
+ ReplInst->andIRFlags(I);
+
+ // FIXME: If both the original and replacement value are part of the
+ // same control-flow region (meaning that the execution of one
+ // guarantees the execution of the other), then we can combine the
+ // noalias scopes here and do better than the general conservative
+ // answer used in combineMetadata().
+
+ // In general, GVN unifies expressions over different control-flow
+ // regions, and so we need a conservative combination of the noalias
+ // scopes.
+ static const unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias, LLVMContext::MD_range,
+ LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
+ LLVMContext::MD_invariant_group};
+ combineMetadata(ReplInst, I, KnownIDs);
+}
+
static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
patchReplacementInstruction(I, Repl);
I->replaceAllUsesWith(Repl);
@@ -1967,10 +2990,6 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
++NumGVNBlocksDeleted;
- // Check to see if there are non-terminating instructions to delete.
- if (isa<TerminatorInst>(BB->begin()))
- return;
-
// Delete the instructions backwards, as it has a reduced likelihood of having
// to update as many def-use and use-def chains. Start after the terminator.
auto StartPoint = BB->rbegin();
@@ -1987,6 +3006,11 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
Inst.eraseFromParent();
++NumGVNInstrDeleted;
}
+ // Now insert something that simplifycfg will turn into an unreachable.
+ Type *Int8Ty = Type::getInt8Ty(BB->getContext());
+ new StoreInst(UndefValue::get(Int8Ty),
+ Constant::getNullValue(Int8Ty->getPointerTo()),
+ BB->getTerminator());
}
void NewGVN::markInstructionForDeletion(Instruction *I) {
@@ -2086,59 +3110,59 @@ bool NewGVN::eliminateInstructions(Function &F) {
}
}
}
- DomTreeNode *Node = DT->getNode(&B);
- if (Node)
- DFSDomMap[&B] = {Node->getDFSNumIn(), Node->getDFSNumOut()};
}
- for (CongruenceClass *CC : CongruenceClasses) {
- // FIXME: We should eventually be able to replace everything still
- // in the initial class with undef, as they should be unreachable.
- // Right now, initial still contains some things we skip value
- // numbering of (UNREACHABLE's, for example).
- if (CC == InitialClass || CC->Dead)
+ // Map to store the use counts
+ DenseMap<const Value *, unsigned int> UseCounts;
+ for (CongruenceClass *CC : reverse(CongruenceClasses)) {
+ // Track the equivalent store info so we can decide whether to try
+ // dead store elimination.
+ SmallVector<ValueDFS, 8> PossibleDeadStores;
+ SmallPtrSet<Instruction *, 8> ProbablyDead;
+ if (CC->isDead() || CC->empty())
continue;
- assert(CC->RepLeader && "We should have had a leader");
+ // Everything still in the TOP class is unreachable or dead.
+ if (CC == TOPClass) {
+#ifndef NDEBUG
+ for (auto M : *CC)
+ assert((!ReachableBlocks.count(cast<Instruction>(M)->getParent()) ||
+ InstructionsToErase.count(cast<Instruction>(M))) &&
+ "Everything in TOP should be unreachable or dead at this "
+ "point");
+#endif
+ continue;
+ }
+ assert(CC->getLeader() && "We should have had a leader");
// If this is a leader that is always available, and it's a
// constant or has no equivalences, just replace everything with
// it. We then update the congruence class with whatever members
// are left.
- if (alwaysAvailable(CC->RepLeader)) {
- SmallPtrSet<Value *, 4> MembersLeft;
- for (auto M : CC->Members) {
-
+ Value *Leader =
+ CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
+ if (alwaysAvailable(Leader)) {
+ CongruenceClass::MemberSet MembersLeft;
+ for (auto M : *CC) {
Value *Member = M;
-
// Void things have no uses we can replace.
- if (Member == CC->RepLeader || Member->getType()->isVoidTy()) {
+ if (Member == Leader || !isa<Instruction>(Member) ||
+ Member->getType()->isVoidTy()) {
MembersLeft.insert(Member);
continue;
}
-
- DEBUG(dbgs() << "Found replacement " << *(CC->RepLeader) << " for "
- << *Member << "\n");
- // Due to equality propagation, these may not always be
- // instructions, they may be real values. We don't really
- // care about trying to replace the non-instructions.
- if (auto *I = dyn_cast<Instruction>(Member)) {
- assert(CC->RepLeader != I &&
- "About to accidentally remove our leader");
- replaceInstruction(I, CC->RepLeader);
- AnythingReplaced = true;
-
- continue;
- } else {
- MembersLeft.insert(I);
- }
+ DEBUG(dbgs() << "Found replacement " << *(Leader) << " for " << *Member
+ << "\n");
+ auto *I = cast<Instruction>(Member);
+ assert(Leader != I && "About to accidentally remove our leader");
+ replaceInstruction(I, Leader);
+ AnythingReplaced = true;
}
- CC->Members.swap(MembersLeft);
-
+ CC->swap(MembersLeft);
} else {
- DEBUG(dbgs() << "Eliminating in congruence class " << CC->ID << "\n");
+ DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID()
+ << "\n");
// If this is a singleton, we can skip it.
- if (CC->Members.size() != 1) {
-
+ if (CC->size() != 1) {
// This is a stack because equality replacement/etc may place
// constants in the middle of the member list, and we want to use
// those constant values in preference to the current leader, over
@@ -2147,24 +3171,19 @@ bool NewGVN::eliminateInstructions(Function &F) {
// Convert the members to DFS ordered sets and then merge them.
SmallVector<ValueDFS, 8> DFSOrderedSet;
- convertDenseToDFSOrdered(CC->Members, DFSOrderedSet);
+ convertClassToDFSOrdered(*CC, DFSOrderedSet, UseCounts, ProbablyDead);
// Sort the whole thing.
std::sort(DFSOrderedSet.begin(), DFSOrderedSet.end());
-
for (auto &VD : DFSOrderedSet) {
int MemberDFSIn = VD.DFSIn;
int MemberDFSOut = VD.DFSOut;
- Value *Member = VD.Val;
- Use *MemberUse = VD.U;
-
- if (Member) {
- // We ignore void things because we can't get a value from them.
- // FIXME: We could actually use this to kill dead stores that are
- // dominated by equivalent earlier stores.
- if (Member->getType()->isVoidTy())
- continue;
- }
+ Value *Def = VD.Def.getPointer();
+ bool FromStore = VD.Def.getInt();
+ Use *U = VD.U;
+ // We ignore void things because we can't get a value from them.
+ if (Def && Def->getType()->isVoidTy())
+ continue;
if (EliminationStack.empty()) {
DEBUG(dbgs() << "Elimination Stack is empty\n");
@@ -2189,69 +3208,240 @@ bool NewGVN::eliminateInstructions(Function &F) {
// start using, we also push.
// Otherwise, we walk along, processing members who are
// dominated by this scope, and eliminate them.
- bool ShouldPush =
- Member && (EliminationStack.empty() || isa<Constant>(Member));
+ bool ShouldPush = Def && EliminationStack.empty();
bool OutOfScope =
!EliminationStack.isInScope(MemberDFSIn, MemberDFSOut);
if (OutOfScope || ShouldPush) {
// Sync to our current scope.
EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut);
- ShouldPush |= Member && EliminationStack.empty();
+ bool ShouldPush = Def && EliminationStack.empty();
if (ShouldPush) {
- EliminationStack.push_back(Member, MemberDFSIn, MemberDFSOut);
+ EliminationStack.push_back(Def, MemberDFSIn, MemberDFSOut);
+ }
+ }
+
+ // Skip the Defs; we only want to eliminate on their uses. But mark
+ // dominated defs as dead.
+ if (Def) {
+ // For anything in this case, what and how we value number
+ // guarantees that any side-effects that would have occurred (i.e.
+ // throwing, etc) can be proven to either still occur (because it's
+ // dominated by something that has the same side-effects), or never
+ // occur. Otherwise, we would not have been able to prove it value
+ // equivalent to something else. For these things, we can just mark
+ // it all dead. Note that this is different from the "ProbablyDead"
+ // set, which may not be dominated by anything, and thus, are only
+ // easy to prove dead if they are also side-effect free. Note that
+ // because stores are put in terms of the stored value, we skip
+ // stored values here. If the stored value is really dead, it will
+ // still be marked for deletion when we process it in its own class.
+ if (!EliminationStack.empty() && Def != EliminationStack.back() &&
+ isa<Instruction>(Def) && !FromStore)
+ markInstructionForDeletion(cast<Instruction>(Def));
+ continue;
+ }
+ // At this point, we know it is a Use we are trying to possibly
+ // replace.
+
+ assert(isa<Instruction>(U->get()) &&
+ "Current def should have been an instruction");
+ assert(isa<Instruction>(U->getUser()) &&
+ "Current user should have been an instruction");
+
+ // If the thing we are replacing into is already marked to be dead,
+ // this use is dead. Note that this is true regardless of whether
+ // we have anything dominating the use or not. We do this here
+ // because we are already walking all the uses anyway.
+ Instruction *InstUse = cast<Instruction>(U->getUser());
+ if (InstructionsToErase.count(InstUse)) {
+ auto &UseCount = UseCounts[U->get()];
+ if (--UseCount == 0) {
+ ProbablyDead.insert(cast<Instruction>(U->get()));
}
}
// If we get to this point, and the stack is empty we must have a use
- // with nothing we can use to eliminate it, just skip it.
+ // with nothing we can use to eliminate this use, so just skip it.
if (EliminationStack.empty())
continue;
- // Skip the Value's, we only want to eliminate on their uses.
- if (Member)
- continue;
- Value *Result = EliminationStack.back();
+ Value *DominatingLeader = EliminationStack.back();
// Don't replace our existing users with ourselves.
- if (MemberUse->get() == Result)
+ if (U->get() == DominatingLeader)
continue;
-
- DEBUG(dbgs() << "Found replacement " << *Result << " for "
- << *MemberUse->get() << " in " << *(MemberUse->getUser())
- << "\n");
+ DEBUG(dbgs() << "Found replacement " << *DominatingLeader << " for "
+ << *U->get() << " in " << *(U->getUser()) << "\n");
// If we replaced something in an instruction, handle the patching of
- // metadata.
- if (auto *ReplacedInst = dyn_cast<Instruction>(MemberUse->get()))
- patchReplacementInstruction(ReplacedInst, Result);
-
- assert(isa<Instruction>(MemberUse->getUser()));
- MemberUse->set(Result);
+ // metadata. Skip this if we are replacing predicateinfo with its
+ // original operand, as we already know we can just drop it.
+ auto *ReplacedInst = cast<Instruction>(U->get());
+ auto *PI = PredInfo->getPredicateInfoFor(ReplacedInst);
+ if (!PI || DominatingLeader != PI->OriginalOp)
+ patchReplacementInstruction(ReplacedInst, DominatingLeader);
+ U->set(DominatingLeader);
+ // This is now a use of the dominating leader, which means if the
+ // dominating leader was dead, it's now live!
+ auto &LeaderUseCount = UseCounts[DominatingLeader];
+ // It's about to be alive again.
+ if (LeaderUseCount == 0 && isa<Instruction>(DominatingLeader))
+ ProbablyDead.erase(cast<Instruction>(DominatingLeader));
+ ++LeaderUseCount;
AnythingReplaced = true;
}
}
}
+ // At this point, anything still in the ProbablyDead set is actually dead if
+ // it would be trivially dead.
+ for (auto *I : ProbablyDead)
+ if (wouldInstructionBeTriviallyDead(I))
+ markInstructionForDeletion(I);
+
// Cleanup the congruence class.
- SmallPtrSet<Value *, 4> MembersLeft;
- for (Value *Member : CC->Members) {
- if (Member->getType()->isVoidTy()) {
+ CongruenceClass::MemberSet MembersLeft;
+ for (auto *Member : *CC)
+ if (!isa<Instruction>(Member) ||
+ !InstructionsToErase.count(cast<Instruction>(Member)))
MembersLeft.insert(Member);
- continue;
- }
-
- if (auto *MemberInst = dyn_cast<Instruction>(Member)) {
- if (isInstructionTriviallyDead(MemberInst)) {
- // TODO: Don't mark loads of undefs.
- markInstructionForDeletion(MemberInst);
- continue;
+ CC->swap(MembersLeft);
+
+ // If we have possible dead stores to look at, try to eliminate them.
+ if (CC->getStoreCount() > 0) {
+ convertClassToLoadsAndStores(*CC, PossibleDeadStores);
+ std::sort(PossibleDeadStores.begin(), PossibleDeadStores.end());
+ ValueDFSStack EliminationStack;
+ for (auto &VD : PossibleDeadStores) {
+ int MemberDFSIn = VD.DFSIn;
+ int MemberDFSOut = VD.DFSOut;
+ Instruction *Member = cast<Instruction>(VD.Def.getPointer());
+ if (EliminationStack.empty() ||
+ !EliminationStack.isInScope(MemberDFSIn, MemberDFSOut)) {
+ // Sync to our current scope.
+ EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut);
+ if (EliminationStack.empty()) {
+ EliminationStack.push_back(Member, MemberDFSIn, MemberDFSOut);
+ continue;
+ }
}
+ // We already did load elimination, so nothing to do here.
+ if (isa<LoadInst>(Member))
+ continue;
+ assert(!EliminationStack.empty());
+ Instruction *Leader = cast<Instruction>(EliminationStack.back());
+ (void)Leader;
+ assert(DT->dominates(Leader->getParent(), Member->getParent()));
+ // Member is dominated by Leader, and thus dead
+ DEBUG(dbgs() << "Marking dead store " << *Member
+ << " that is dominated by " << *Leader << "\n");
+ markInstructionForDeletion(Member);
+ CC->erase(Member);
+ ++NumGVNDeadStores;
}
- MembersLeft.insert(Member);
}
- CC->Members.swap(MembersLeft);
}
return AnythingReplaced;
}
+
+// This function provides global ranking of operations so that we can place them
+// in a canonical order. Note that rank alone is not necessarily enough for a
+// complete ordering, as constants all have the same rank. However, generally,
+// we will simplify an operation with all constants so that it doesn't matter
+// what order they appear in.
+unsigned int NewGVN::getRank(const Value *V) const {
+ // Prefer undef to anything else
+ if (isa<UndefValue>(V))
+ return 0;
+ if (isa<Constant>(V))
+ return 1;
+ else if (auto *A = dyn_cast<Argument>(V))
+ return 2 + A->getArgNo();
+
+ // Need to shift the instruction DFS by number of arguments + 3 to account for
+ // the constant and argument ranking above.
+ unsigned Result = InstrToDFSNum(V);
+ if (Result > 0)
+ return 3 + NumFuncArgs + Result;
+ // Unreachable or something else, just return a really large number.
+ return ~0;
+}
+
+// This is a function that says whether two commutative operations should
+// have their order swapped when canonicalizing.
+bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const {
+ // Because we only care about a total ordering, and don't rewrite expressions
+ // in this order, we order by rank, which will give a strict weak ordering to
+ // everything but constants, and then we order by pointer address.
+ return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B);
+}
+
+class NewGVNLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid.
+ NewGVNLegacyPass() : FunctionPass(ID) {
+ initializeNewGVNLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+
+private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+
+bool NewGVNLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+ return NewGVN(F, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
+ &getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ &getAnalysis<MemorySSAWrapperPass>().getMSSA(),
+ F.getParent()->getDataLayout())
+ .runGVN();
+}
+
+INITIALIZE_PASS_BEGIN(NewGVNLegacyPass, "newgvn", "Global Value Numbering",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(NewGVNLegacyPass, "newgvn", "Global Value Numbering", false,
+ false)
+
+char NewGVNLegacyPass::ID = 0;
+
+// createGVNPass - The public interface to this file.
+FunctionPass *llvm::createNewGVNPass() { return new NewGVNLegacyPass(); }
+
+PreservedAnalyses NewGVNPass::run(Function &F, AnalysisManager<Function> &AM) {
+ // Apparently the order in which we get these results matters for
+ // the old GVN (see Chandler's comment in GVN.cpp). I'll keep
+ // the same order here, just in case.
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+ bool Changed =
+ NewGVN(F, &DT, &AC, &TLI, &AA, &MSSA, F.getParent()->getDataLayout())
+ .runGVN();
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
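
Editor's note: the getRank/shouldSwapOperands pair added above canonicalizes commutative operands with a (rank, pointer) comparison. The following is a minimal standalone C++ sketch of that ordering idea only; the toy Value struct and its Rank field are illustrative stand-ins, not the LLVM types.

#include <cassert>
#include <utility>

struct Value { unsigned Rank; };  // hypothetical stand-in for llvm::Value

static unsigned getRank(const Value *V) { return V->Rank; }

// Order by (rank, address); request a swap when the left operand compares
// greater, so the lower-ranked operand ends up first.
static bool shouldSwapOperands(const Value *A, const Value *B) {
  return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B);
}

int main() {
  Value Constant{1}, Inst{7};
  const Value *LHS = &Inst, *RHS = &Constant;
  if (shouldSwapOperands(LHS, RHS))
    std::swap(LHS, RHS);
  assert(getRank(LHS) <= getRank(RHS));  // operands now in canonical order
  return 0;
}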
diff --git a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index 1a7ddc9585ba..1bfecea2f61e 100644
--- a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -66,7 +66,7 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
// Add attribute "readnone" so that backend can use a native sqrt instruction
// for this call. Insert a FP compare instruction and a conditional branch
// at the end of CurrBB.
- Call->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
+ Call->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
CurrBB.getTerminator()->eraseFromParent();
Builder.SetInsertPoint(&CurrBB);
Value *FCmp = Builder.CreateFCmpOEQ(Call, Call);
@@ -98,14 +98,14 @@ static bool runPartiallyInlineLibCalls(Function &F, TargetLibraryInfo *TLI,
// Skip if function either has local linkage or is not a known library
// function.
- LibFunc::Func LibFunc;
+ LibFunc LF;
if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() ||
- !TLI->getLibFunc(CalledFunc->getName(), LibFunc))
+ !TLI->getLibFunc(CalledFunc->getName(), LF))
continue;
- switch (LibFunc) {
- case LibFunc::sqrtf:
- case LibFunc::sqrt:
+ switch (LF) {
+ case LibFunc_sqrtf:
+ case LibFunc_sqrt:
if (TTI->haveFastSqrt(Call->getType()) &&
optimizeSQRT(Call, CalledFunc, *CurrBB, BB))
break;
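
Editor's note: optimizeSQRT above guards a fast sqrt with an ordered self-compare (fcmp oeq) and only falls back to the library call when the fast result is NaN. Below is a standalone sketch of that control flow at the C++ level, using __builtin_sqrt as an assumed stand-in for the native instruction the backend would emit.

#include <cmath>
#include <cstdio>

// Stand-in for the native sqrt instruction the readnone call lowers to.
static double nativeSqrt(double X) { return __builtin_sqrt(X); }

static double partiallyInlinedSqrt(double X) {
  double R = nativeSqrt(X);
  if (R == R)            // fcmp oeq R, R: only false when R is NaN
    return R;            // fast path, no libm call
  return std::sqrt(X);   // slow path: full library call for the edge cases
}

int main() {
  std::printf("%f %f\n", partiallyInlinedSqrt(2.0), partiallyInlinedSqrt(-1.0));
  return 0;
}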
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
index 65c814d7a63b..3dcab6090789 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -1069,8 +1069,7 @@ Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) {
///
/// Ops is the top-level list of add operands we're trying to factor.
static void FindSingleUseMultiplyFactors(Value *V,
- SmallVectorImpl<Value*> &Factors,
- const SmallVectorImpl<ValueEntry> &Ops) {
+ SmallVectorImpl<Value*> &Factors) {
BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
if (!BO) {
Factors.push_back(V);
@@ -1078,8 +1077,8 @@ static void FindSingleUseMultiplyFactors(Value *V,
}
// Otherwise, add the LHS and RHS to the list of factors.
- FindSingleUseMultiplyFactors(BO->getOperand(1), Factors, Ops);
- FindSingleUseMultiplyFactors(BO->getOperand(0), Factors, Ops);
+ FindSingleUseMultiplyFactors(BO->getOperand(1), Factors);
+ FindSingleUseMultiplyFactors(BO->getOperand(0), Factors);
}
/// Optimize a series of operands to an 'and', 'or', or 'xor' instruction.
@@ -1499,7 +1498,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
// Compute all of the factors of this added value.
SmallVector<Value*, 8> Factors;
- FindSingleUseMultiplyFactors(BOp, Factors, Ops);
+ FindSingleUseMultiplyFactors(BOp, Factors);
assert(Factors.size() > 1 && "Bad linearize!");
// Add one to FactorOccurrences for each unique factor in this op.
@@ -2236,8 +2235,8 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
ValueRankMap.clear();
if (MadeChange) {
- // FIXME: This should also 'preserve the CFG'.
- auto PA = PreservedAnalyses();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
return PA;
}
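
Editor's note: FindSingleUseMultiplyFactors now takes only the value and the factor list, since the Ops parameter was unused. Below is a standalone sketch of the recursive factor walk on a toy expression tree (the Expr struct is illustrative, not llvm::Value).

#include <vector>

struct Expr {
  bool IsMul;        // is this a reassociable multiply?
  Expr *LHS, *RHS;   // only meaningful when IsMul is true
};

// Descend through multiplies and collect the leaf operands as factors.
static void findMultiplyFactors(Expr *V, std::vector<Expr *> &Factors) {
  if (!V->IsMul) {
    Factors.push_back(V);
    return;
  }
  findMultiplyFactors(V->RHS, Factors);
  findMultiplyFactors(V->LHS, Factors);
}

int main() {
  Expr A{false, nullptr, nullptr}, B{false, nullptr, nullptr},
       C{false, nullptr, nullptr};
  Expr AB{true, &A, &B}, ABC{true, &AB, &C};   // (A * B) * C
  std::vector<Expr *> Factors;
  findMultiplyFactors(&ABC, Factors);          // yields {C, B, A}
  return Factors.size() == 3 ? 0 : 1;
}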
diff --git a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 1de742050cb3..f344eb151464 100644
--- a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -365,6 +365,11 @@ findBaseDefiningValueOfVector(Value *I) {
// for particular sufflevector patterns.
return BaseDefiningValueResult(I, false);
+ // The behavior of getelementptr instructions is the same for vector and
+ // non-vector data types.
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
+ return findBaseDefiningValue(GEP->getPointerOperand());
+
// A PHI or Select is a base defining value. The outer findBasePointer
// algorithm is responsible for constructing a base value for this BDV.
assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
@@ -634,7 +639,7 @@ static BDVState meetBDVStateImpl(const BDVState &LHS, const BDVState &RHS) {
// Values of type BDVState form a lattice, and this function implements the meet
// operation.
-static BDVState meetBDVState(BDVState LHS, BDVState RHS) {
+static BDVState meetBDVState(const BDVState &LHS, const BDVState &RHS) {
BDVState Result = meetBDVStateImpl(LHS, RHS);
assert(Result == meetBDVStateImpl(RHS, LHS) &&
"Math is wrong: meet does not commute!");
@@ -1123,14 +1128,14 @@ normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent,
// Create new attribute set containing only attributes which can be transferred
// from original call to the safepoint.
-static AttributeSet legalizeCallAttributes(AttributeSet AS) {
- AttributeSet Ret;
+static AttributeList legalizeCallAttributes(AttributeList AS) {
+ AttributeList Ret;
for (unsigned Slot = 0; Slot < AS.getNumSlots(); Slot++) {
unsigned Index = AS.getSlotIndex(Slot);
- if (Index == AttributeSet::ReturnIndex ||
- Index == AttributeSet::FunctionIndex) {
+ if (Index == AttributeList::ReturnIndex ||
+ Index == AttributeList::FunctionIndex) {
for (Attribute Attr : make_range(AS.begin(Slot), AS.end(Slot))) {
@@ -1148,7 +1153,7 @@ static AttributeSet legalizeCallAttributes(AttributeSet AS) {
Ret = Ret.addAttributes(
AS.getContext(), Index,
- AttributeSet::get(AS.getContext(), Index, AttrBuilder(Attr)));
+ AttributeList::get(AS.getContext(), Index, AttrBuilder(Attr)));
}
}
@@ -1299,12 +1304,11 @@ static StringRef getDeoptLowering(CallSite CS) {
const char *DeoptLowering = "deopt-lowering";
if (CS.hasFnAttr(DeoptLowering)) {
// FIXME: CallSite has a *really* confusing interface around attributes
- // with values.
- const AttributeSet &CSAS = CS.getAttributes();
- if (CSAS.hasAttribute(AttributeSet::FunctionIndex,
- DeoptLowering))
- return CSAS.getAttribute(AttributeSet::FunctionIndex,
- DeoptLowering).getValueAsString();
+ // with values.
+ const AttributeList &CSAS = CS.getAttributes();
+ if (CSAS.hasAttribute(AttributeList::FunctionIndex, DeoptLowering))
+ return CSAS.getAttribute(AttributeList::FunctionIndex, DeoptLowering)
+ .getValueAsString();
Function *F = CS.getCalledFunction();
assert(F && F->hasFnAttribute(DeoptLowering));
return F->getFnAttribute(DeoptLowering).getValueAsString();
@@ -1388,7 +1392,6 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
// Create the statepoint given all the arguments
Instruction *Token = nullptr;
- AttributeSet ReturnAttrs;
if (CS.isCall()) {
CallInst *ToReplace = cast<CallInst>(CS.getInstruction());
CallInst *Call = Builder.CreateGCStatepointCall(
@@ -1400,11 +1403,12 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
// Currently we will fail on parameter attributes and on certain
// function attributes.
- AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes());
+ AttributeList NewAttrs = legalizeCallAttributes(ToReplace->getAttributes());
// In case if we can handle this set of attributes - set up function attrs
// directly on statepoint and return attrs later for gc_result intrinsic.
- Call->setAttributes(NewAttrs.getFnAttributes());
- ReturnAttrs = NewAttrs.getRetAttributes();
+ Call->setAttributes(AttributeList::get(Call->getContext(),
+ AttributeList::FunctionIndex,
+ NewAttrs.getFnAttributes()));
Token = Call;
@@ -1428,11 +1432,12 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
// Currently we will fail on parameter attributes and on certain
// function attributes.
- AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes());
+ AttributeList NewAttrs = legalizeCallAttributes(ToReplace->getAttributes());
// In case if we can handle this set of attributes - set up function attrs
// directly on statepoint and return attrs later for gc_result intrinsic.
- Invoke->setAttributes(NewAttrs.getFnAttributes());
- ReturnAttrs = NewAttrs.getRetAttributes();
+ Invoke->setAttributes(AttributeList::get(Invoke->getContext(),
+ AttributeList::FunctionIndex,
+ NewAttrs.getFnAttributes()));
Token = Invoke;
@@ -1478,7 +1483,9 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
StringRef Name =
CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : "";
CallInst *GCResult = Builder.CreateGCResult(Token, CS.getType(), Name);
- GCResult->setAttributes(CS.getAttributes().getRetAttributes());
+ GCResult->setAttributes(
+ AttributeList::get(GCResult->getContext(), AttributeList::ReturnIndex,
+ CS.getAttributes().getRetAttributes()));
// We cannot RAUW or delete CS.getInstruction() because it could be in the
// live set of some other safepoint, in which case that safepoint's
@@ -1615,8 +1622,10 @@ static void relocationViaAlloca(
// Emit alloca for "LiveValue" and record it in "allocaMap" and
// "PromotableAllocas"
+ const DataLayout &DL = F.getParent()->getDataLayout();
auto emitAllocaFor = [&](Value *LiveValue) {
- AllocaInst *Alloca = new AllocaInst(LiveValue->getType(), "",
+ AllocaInst *Alloca = new AllocaInst(LiveValue->getType(),
+ DL.getAllocaAddrSpace(), "",
F.getEntryBlock().getFirstNonPHI());
AllocaMap[LiveValue] = Alloca;
PromotableAllocas.push_back(Alloca);
@@ -1873,7 +1882,7 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,
"non noop cast is found during rematerialization");
Type *SrcTy = CI->getOperand(0)->getType();
- Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy);
+ Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy, CI);
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
// Cost of the address calculation
@@ -2304,7 +2313,7 @@ static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH,
if (!R.empty())
AH.setAttributes(AH.getAttributes().removeAttributes(
- Ctx, Index, AttributeSet::get(Ctx, Index, R)));
+ Ctx, Index, AttributeList::get(Ctx, Index, R)));
}
void
@@ -2316,7 +2325,7 @@ RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) {
RemoveNonValidAttrAtIndex(Ctx, F, A.getArgNo() + 1);
if (isa<PointerType>(F.getReturnType()))
- RemoveNonValidAttrAtIndex(Ctx, F, AttributeSet::ReturnIndex);
+ RemoveNonValidAttrAtIndex(Ctx, F, AttributeList::ReturnIndex);
}
void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) {
@@ -2351,7 +2360,7 @@ void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) {
if (isa<PointerType>(CS.getArgument(i)->getType()))
RemoveNonValidAttrAtIndex(Ctx, CS, i + 1);
if (isa<PointerType>(CS.getType()))
- RemoveNonValidAttrAtIndex(Ctx, CS, AttributeSet::ReturnIndex);
+ RemoveNonValidAttrAtIndex(Ctx, CS, AttributeList::ReturnIndex);
}
}
}
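
Editor's note: meetBDVState above follows the common pattern of an unchecked lattice meet wrapped by a version that asserts the meet commutes. Below is a standalone sketch of that pattern over a toy three-element lattice; the State enum is illustrative, not the pass's BDVState.

#include <algorithm>
#include <cassert>

enum class State { Unknown = 0, Base = 1, Conflict = 2 };  // toy lattice

// Toy meet: the more pessimistic (larger) state wins.
static State meetImpl(State LHS, State RHS) { return std::max(LHS, RHS); }

static State meet(State LHS, State RHS) {
  State Result = meetImpl(LHS, RHS);
  assert(Result == meetImpl(RHS, LHS) && "meet does not commute!");
  return Result;
}

int main() {
  return meet(State::Base, State::Conflict) == State::Conflict ? 0 : 1;
}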
diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
index ede381c4c243..8908dae2f545 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -140,6 +140,14 @@ public:
return nullptr;
}
+ /// getBlockAddress - If this is a constant with a BlockAddress value, return
+ /// it, otherwise return null.
+ BlockAddress *getBlockAddress() const {
+ if (isConstant())
+ return dyn_cast<BlockAddress>(getConstant());
+ return nullptr;
+ }
+
void markForcedConstant(Constant *V) {
assert(isUnknown() && "Can't force a defined value!");
Val.setInt(forcedconstant);
@@ -306,20 +314,14 @@ public:
return MRVFunctionsTracked;
}
- void markOverdefined(Value *V) {
- assert(!V->getType()->isStructTy() &&
- "structs should use markAnythingOverdefined");
- markOverdefined(ValueState[V], V);
- }
-
- /// markAnythingOverdefined - Mark the specified value overdefined. This
+ /// markOverdefined - Mark the specified value overdefined. This
/// works with both scalars and structs.
- void markAnythingOverdefined(Value *V) {
+ void markOverdefined(Value *V) {
if (auto *STy = dyn_cast<StructType>(V->getType()))
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
markOverdefined(getStructValueState(V, i), V);
else
- markOverdefined(V);
+ markOverdefined(ValueState[V], V);
}
// isStructLatticeConstant - Return true if all the lattice values
@@ -513,12 +515,12 @@ private:
void visitCmpInst(CmpInst &I);
void visitExtractValueInst(ExtractValueInst &EVI);
void visitInsertValueInst(InsertValueInst &IVI);
- void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); }
+ void visitLandingPadInst(LandingPadInst &I) { markOverdefined(&I); }
void visitFuncletPadInst(FuncletPadInst &FPI) {
- markAnythingOverdefined(&FPI);
+ markOverdefined(&FPI);
}
void visitCatchSwitchInst(CatchSwitchInst &CPI) {
- markAnythingOverdefined(&CPI);
+ markOverdefined(&CPI);
visitTerminatorInst(CPI);
}
@@ -538,16 +540,16 @@ private:
void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ }
void visitFenceInst (FenceInst &I) { /*returns void*/ }
void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
- markAnythingOverdefined(&I);
+ markOverdefined(&I);
}
void visitAtomicRMWInst (AtomicRMWInst &I) { markOverdefined(&I); }
void visitAllocaInst (Instruction &I) { markOverdefined(&I); }
- void visitVAArgInst (Instruction &I) { markAnythingOverdefined(&I); }
+ void visitVAArgInst (Instruction &I) { markOverdefined(&I); }
void visitInstruction(Instruction &I) {
// If a new instruction is added to LLVM that we don't handle.
DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n');
- markAnythingOverdefined(&I); // Just in case
+ markOverdefined(&I); // Just in case
}
};
@@ -602,14 +604,36 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
return;
}
- Succs[SI->findCaseValue(CI).getSuccessorIndex()] = true;
+ Succs[SI->findCaseValue(CI)->getSuccessorIndex()] = true;
return;
}
- // TODO: This could be improved if the operand is a [cast of a] BlockAddress.
- if (isa<IndirectBrInst>(&TI)) {
- // Just mark all destinations executable!
- Succs.assign(TI.getNumSuccessors(), true);
+ // If this is an indirect branch whose address is a blockaddress, we mark
+ // the target as executable.
+ if (auto *IBR = dyn_cast<IndirectBrInst>(&TI)) {
+ // Casts are folded by visitCastInst.
+ LatticeVal IBRValue = getValueState(IBR->getAddress());
+ BlockAddress *Addr = IBRValue.getBlockAddress();
+ if (!Addr) { // Overdefined or unknown condition?
+ // All destinations are executable!
+ if (!IBRValue.isUnknown())
+ Succs.assign(TI.getNumSuccessors(), true);
+ return;
+ }
+
+ BasicBlock* T = Addr->getBasicBlock();
+ assert(Addr->getFunction() == T->getParent() &&
+ "Block address of a different function ?");
+ for (unsigned i = 0; i < IBR->getNumSuccessors(); ++i) {
+ // This is the target.
+ if (IBR->getDestination(i) == T) {
+ Succs[i] = true;
+ return;
+ }
+ }
+
+ // If we didn't find our destination in the IBR successor list, then we
+ // have undefined behavior. It's ok to assume no successor is executable.
return;
}
@@ -659,13 +683,21 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
if (!CI)
return !SCValue.isUnknown();
- return SI->findCaseValue(CI).getCaseSuccessor() == To;
+ return SI->findCaseValue(CI)->getCaseSuccessor() == To;
}
- // Just mark all destinations executable!
- // TODO: This could be improved if the operand is a [cast of a] BlockAddress.
- if (isa<IndirectBrInst>(TI))
- return true;
+ // If this is an indirect branch whose address is a blockaddress, we mark
+ // the target as executable.
+ if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
+ LatticeVal IBRValue = getValueState(IBR->getAddress());
+ BlockAddress *Addr = IBRValue.getBlockAddress();
+
+ if (!Addr)
+ return !IBRValue.isUnknown();
+
+ // At this point, the indirectbr is branching on a blockaddress.
+ return Addr->getBasicBlock() == To;
+ }
DEBUG(dbgs() << "Unknown terminator instruction: " << *TI << '\n');
llvm_unreachable("SCCP: Don't know how to handle this terminator!");
@@ -693,7 +725,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
// If this PN returns a struct, just mark the result overdefined.
// TODO: We could do a lot better than this if code actually uses this.
if (PN.getType()->isStructTy())
- return markAnythingOverdefined(&PN);
+ return markOverdefined(&PN);
if (getValueState(&PN).isOverdefined())
return; // Quick exit
@@ -803,7 +835,7 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
// If this returns a struct, mark all elements over defined, we don't track
// structs in structs.
if (EVI.getType()->isStructTy())
- return markAnythingOverdefined(&EVI);
+ return markOverdefined(&EVI);
// If this is extracting from more than one level of struct, we don't know.
if (EVI.getNumIndices() != 1)
@@ -828,7 +860,7 @@ void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
// If this has more than one index, we can't handle it, drive all results to
// undef.
if (IVI.getNumIndices() != 1)
- return markAnythingOverdefined(&IVI);
+ return markOverdefined(&IVI);
Value *Aggr = IVI.getAggregateOperand();
unsigned Idx = *IVI.idx_begin();
@@ -857,7 +889,7 @@ void SCCPSolver::visitSelectInst(SelectInst &I) {
// If this select returns a struct, just mark the result overdefined.
// TODO: We could do a lot better than this if code actually uses this.
if (I.getType()->isStructTy())
- return markAnythingOverdefined(&I);
+ return markOverdefined(&I);
LatticeVal CondValue = getValueState(I.getCondition());
if (CondValue.isUnknown())
@@ -910,9 +942,16 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
// Otherwise, one of our operands is overdefined. Try to produce something
// better than overdefined with some tricks.
-
- // If this is an AND or OR with 0 or -1, it doesn't matter that the other
- // operand is overdefined.
+ // If this is 0 / Y, it doesn't matter that the second operand is
+ // overdefined, and we can replace it with zero.
+ if (I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv)
+ if (V1State.isConstant() && V1State.getConstant()->isNullValue())
+ return markConstant(IV, &I, V1State.getConstant());
+
+ // If this is:
+ // -> AND/MUL with 0
+ // -> OR with -1
+ // it doesn't matter that the other operand is overdefined.
if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Mul ||
I.getOpcode() == Instruction::Or) {
LatticeVal *NonOverdefVal = nullptr;
@@ -1021,7 +1060,7 @@ void SCCPSolver::visitStoreInst(StoreInst &SI) {
void SCCPSolver::visitLoadInst(LoadInst &I) {
// If this load is of a struct, just mark the result overdefined.
if (I.getType()->isStructTy())
- return markAnythingOverdefined(&I);
+ return markOverdefined(&I);
LatticeVal PtrVal = getValueState(I.getOperand(0));
if (PtrVal.isUnknown()) return; // The pointer is not resolved yet!
@@ -1107,7 +1146,7 @@ CallOverdefined:
}
// Otherwise, we don't know anything about this call, mark it overdefined.
- return markAnythingOverdefined(I);
+ return markOverdefined(I);
}
// If this is a local function that doesn't have its address taken, mark its
@@ -1483,6 +1522,31 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
return true;
}
+ if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
+ // Indirect branch with no successors? It's ok to assume it branches
+ // to no target.
+ if (IBR->getNumSuccessors() < 1)
+ continue;
+
+ if (!getValueState(IBR->getAddress()).isUnknown())
+ continue;
+
+ // If the input to SCCP is actually a branch on undef, fix the undef to
+ // the first successor of the indirect branch.
+ if (isa<UndefValue>(IBR->getAddress())) {
+ IBR->setAddress(BlockAddress::get(IBR->getSuccessor(0)));
+ markEdgeExecutable(&BB, IBR->getSuccessor(0));
+ return true;
+ }
+
+ // Otherwise, it is a branch on a symbolic value which is currently
+ // considered to be undef. Handle this by forcing the input value to the
+ // branch to the first successor.
+ markForcedConstant(IBR->getAddress(),
+ BlockAddress::get(IBR->getSuccessor(0)));
+ return true;
+ }
+
if (auto *SI = dyn_cast<SwitchInst>(TI)) {
if (!SI->getNumCases() || !getValueState(SI->getCondition()).isUnknown())
continue;
@@ -1490,12 +1554,12 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
// If the input to SCCP is actually switch on undef, fix the undef to
// the first constant.
if (isa<UndefValue>(SI->getCondition())) {
- SI->setCondition(SI->case_begin().getCaseValue());
- markEdgeExecutable(&BB, SI->case_begin().getCaseSuccessor());
+ SI->setCondition(SI->case_begin()->getCaseValue());
+ markEdgeExecutable(&BB, SI->case_begin()->getCaseSuccessor());
return true;
}
- markForcedConstant(SI->getCondition(), SI->case_begin().getCaseValue());
+ markForcedConstant(SI->getCondition(), SI->case_begin()->getCaseValue());
return true;
}
}
@@ -1545,7 +1609,7 @@ static bool runSCCP(Function &F, const DataLayout &DL,
// Mark all arguments to the function as being overdefined.
for (Argument &AI : F.args())
- Solver.markAnythingOverdefined(&AI);
+ Solver.markOverdefined(&AI);
// Solve for constants.
bool ResolvedUndefs = true;
@@ -1728,7 +1792,7 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
// Assume nothing about the incoming arguments.
for (Argument &AI : F.args())
- Solver.markAnythingOverdefined(&AI);
+ Solver.markOverdefined(&AI);
}
// Loop over global variables. We inform the solver about any internal global
@@ -1817,32 +1881,9 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
if (!I) continue;
bool Folded = ConstantFoldTerminator(I->getParent());
- if (!Folded) {
- // The constant folder may not have been able to fold the terminator
- // if this is a branch or switch on undef. Fold it manually as a
- // branch to the first successor.
-#ifndef NDEBUG
- if (auto *BI = dyn_cast<BranchInst>(I)) {
- assert(BI->isConditional() && isa<UndefValue>(BI->getCondition()) &&
- "Branch should be foldable!");
- } else if (auto *SI = dyn_cast<SwitchInst>(I)) {
- assert(isa<UndefValue>(SI->getCondition()) && "Switch should fold");
- } else {
- llvm_unreachable("Didn't fold away reference to block!");
- }
-#endif
-
- // Make this an uncond branch to the first successor.
- TerminatorInst *TI = I->getParent()->getTerminator();
- BranchInst::Create(TI->getSuccessor(0), TI);
-
- // Remove entries in successor phi nodes to remove edges.
- for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i)
- TI->getSuccessor(i)->removePredecessor(TI->getParent());
-
- // Remove the old terminator.
- TI->eraseFromParent();
- }
+ assert(Folded &&
+ "Expect TermInst on constantint or blockaddress to be folded");
+ (void) Folded;
}
// Finally, delete the basic block.
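
Editor's note: the indirectbr handling added above marks only the successor that matches a known blockaddress, marks all successors when the address is overdefined, and marks none while it is still unknown. Below is a standalone sketch of that decision; the toy Block type stands in for llvm::BasicBlock.

#include <cstddef>
#include <vector>

struct Block { int Id; };

static std::vector<bool> feasibleSuccessors(const std::vector<Block *> &Succs,
                                            const Block *KnownTarget,
                                            bool Overdefined) {
  std::vector<bool> Feasible(Succs.size(), false);
  if (!KnownTarget) {
    if (Overdefined)                       // overdefined address: anything goes
      Feasible.assign(Succs.size(), true);
    return Feasible;                       // unknown address: nothing yet
  }
  for (std::size_t I = 0; I < Succs.size(); ++I)
    if (Succs[I] == KnownTarget) {         // this is the target
      Feasible[I] = true;
      return Feasible;
    }
  // Target not in the successor list: undefined behavior, so it is ok to
  // leave every successor non-executable.
  return Feasible;
}

int main() {
  Block A{0}, B{1};
  std::vector<Block *> Succs{&A, &B};
  return feasibleSuccessors(Succs, &B, false)[1] ? 0 : 1;
}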
diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
index bfcb15530ef5..d01e91a7f235 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1825,6 +1825,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
// Rank the remaining candidate vector types. This is easy because we know
// they're all integer vectors. We sort by ascending number of elements.
auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
+ (void)DL;
assert(DL.getTypeSizeInBits(RHSTy) == DL.getTypeSizeInBits(LHSTy) &&
"Cannot have vector types of different sizes!");
assert(RHSTy->getElementType()->isIntegerTy() &&
@@ -2294,7 +2295,8 @@ private:
#endif
return getAdjustedPtr(IRB, DL, &NewAI,
- APInt(DL.getPointerSizeInBits(), Offset), PointerTy,
+ APInt(DL.getPointerTypeSizeInBits(PointerTy), Offset),
+ PointerTy,
#ifndef NDEBUG
Twine(OldName) + "."
#else
@@ -2369,6 +2371,8 @@ private:
Value *OldOp = LI.getOperand(0);
assert(OldOp == OldPtr);
+ unsigned AS = LI.getPointerAddressSpace();
+
Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
: LI.getType();
const bool IsLoadPastEnd = DL.getTypeStoreSize(TargetTy) > SliceSize;
@@ -2387,6 +2391,10 @@ private:
LI.isVolatile(), LI.getName());
if (LI.isVolatile())
NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
+
+ // Try to preserve nonnull metadata
+ if (TargetTy->isPointerTy())
+ NewLI->copyMetadata(LI, LLVMContext::MD_nonnull);
V = NewLI;
// If this is an integer load past the end of the slice (which means the
@@ -2401,7 +2409,7 @@ private:
"endian_shift");
}
} else {
- Type *LTy = TargetTy->getPointerTo();
+ Type *LTy = TargetTy->getPointerTo(AS);
LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
getSliceAlign(TargetTy),
LI.isVolatile(), LI.getName());
@@ -2429,7 +2437,7 @@ private:
// the computed value, and then replace the placeholder with LI, leaving
// LI only used for this computation.
Value *Placeholder =
- new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
+ new LoadInst(UndefValue::get(LI.getType()->getPointerTo(AS)));
V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
"insert");
LI.replaceAllUsesWith(V);
@@ -2542,7 +2550,8 @@ private:
NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
SI.isVolatile());
} else {
- Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo());
+ unsigned AS = SI.getPointerAddressSpace();
+ Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo(AS));
NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()),
SI.isVolatile());
}
@@ -3857,7 +3866,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
if (Alignment <= DL.getABITypeAlignment(SliceTy))
Alignment = 0;
NewAI = new AllocaInst(
- SliceTy, nullptr, Alignment,
+ SliceTy, AI.getType()->getAddressSpace(), nullptr, Alignment,
AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
++NumNewAllocas;
}
@@ -4184,7 +4193,7 @@ bool SROA::promoteAllocas(Function &F) {
NumPromoted += PromotableAllocas.size();
DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
- PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC);
+ PromoteMemToReg(PromotableAllocas, *DT, AC);
PromotableAllocas.clear();
return true;
}
@@ -4234,9 +4243,8 @@ PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
if (!Changed)
return PreservedAnalyses::all();
- // FIXME: Even when promoting allocas we should preserve some abstract set of
- // CFG-specific analyses.
PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
return PA;
}
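
Editor's note: the (void)DL added to the RankVectorTypes lambda above is the usual idiom for a capture that is only referenced inside assert(): with NDEBUG the asserts compile away and the capture would otherwise be flagged as unused. A standalone sketch of the same idiom, with illustrative names:

#include <cassert>

static bool sameSize(unsigned A, unsigned B, unsigned Expected) {
  auto Check = [Expected](unsigned X) {
    (void)Expected;                 // only otherwise used in the assert below
    assert(X == Expected && "unexpected size");
    return X;
  };
  return Check(A) == Check(B);
}

int main() { return sameSize(32, 32, 32) ? 0 : 1; }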
diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
index afe7483006ae..00e3c95f6f06 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -43,13 +43,14 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeDSELegacyPassPass(Registry);
initializeGuardWideningLegacyPassPass(Registry);
initializeGVNLegacyPassPass(Registry);
- initializeNewGVNPass(Registry);
+ initializeNewGVNLegacyPassPass(Registry);
initializeEarlyCSELegacyPassPass(Registry);
initializeEarlyCSEMemSSALegacyPassPass(Registry);
initializeGVNHoistLegacyPassPass(Registry);
initializeFlattenCFGPassPass(Registry);
initializeInductiveRangeCheckEliminationPass(Registry);
initializeIndVarSimplifyLegacyPassPass(Registry);
+ initializeInferAddressSpacesPass(Registry);
initializeJumpThreadingPass(Registry);
initializeLegacyLICMPassPass(Registry);
initializeLegacyLoopSinkPassPass(Registry);
@@ -58,6 +59,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeLoopAccessLegacyAnalysisPass(Registry);
initializeLoopInstSimplifyLegacyPassPass(Registry);
initializeLoopInterchangePass(Registry);
+ initializeLoopPredicationLegacyPassPass(Registry);
initializeLoopRotateLegacyPassPass(Registry);
initializeLoopStrengthReducePass(Registry);
initializeLoopRerollPass(Registry);
@@ -79,6 +81,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeIPSCCPLegacyPassPass(Registry);
initializeSROALegacyPassPass(Registry);
initializeCFGSimplifyPassPass(Registry);
+ initializeLateCFGSimplifyPassPass(Registry);
initializeStructurizeCFGPass(Registry);
initializeSinkingLegacyPassPass(Registry);
initializeTailCallElimPass(Registry);
@@ -115,6 +118,10 @@ void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createCFGSimplificationPass());
}
+void LLVMAddLateCFGSimplificationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLateCFGSimplificationPass());
+}
+
void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createDeadStoreEliminationPass());
}
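
Editor's note: LLVMAddLateCFGSimplificationPass gives C API clients access to the new late pass. Below is a hedged usage sketch, assuming the matching declaration lands in llvm-c/Transforms/Scalar.h and that the caller links against LLVM-C; runLateSimplifyCFG is just an illustrative wrapper name.

#include "llvm-c/Core.h"
#include "llvm-c/Transforms/Scalar.h"

// Run only the late CFG simplification over a module via the C API.
static void runLateSimplifyCFG(LLVMModuleRef M) {
  LLVMPassManagerRef PM = LLVMCreatePassManager();
  LLVMAddLateCFGSimplificationPass(PM);   // entry point added in this change
  LLVMRunPassManager(PM, M);
  LLVMDisposePassManager(PM);
}

int main() {
  LLVMModuleRef M = LLVMModuleCreateWithName("demo");
  runLateSimplifyCFG(M);
  LLVMDisposeModule(M);
  return 0;
}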
diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index 39969e27367f..c0c09a7e43fe 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -520,12 +520,25 @@ bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
unsigned NumElems = VT->getNumElements();
unsigned NumIndices = GEPI.getNumIndices();
- Scatterer Base = scatter(&GEPI, GEPI.getOperand(0));
+ // The base pointer might be scalar even if it's a vector GEP. In those cases,
+ // splat the pointer into a vector value, and scatter that vector.
+ Value *Op0 = GEPI.getOperand(0);
+ if (!Op0->getType()->isVectorTy())
+ Op0 = Builder.CreateVectorSplat(NumElems, Op0);
+ Scatterer Base = scatter(&GEPI, Op0);
SmallVector<Scatterer, 8> Ops;
Ops.resize(NumIndices);
- for (unsigned I = 0; I < NumIndices; ++I)
- Ops[I] = scatter(&GEPI, GEPI.getOperand(I + 1));
+ for (unsigned I = 0; I < NumIndices; ++I) {
+ Value *Op = GEPI.getOperand(I + 1);
+
+ // The indices might be scalars even if it's a vector GEP. In those cases,
+ // splat the scalar into a vector value, and scatter that vector.
+ if (!Op->getType()->isVectorTy())
+ Op = Builder.CreateVectorSplat(NumElems, Op);
+
+ Ops[I] = scatter(&GEPI, Op);
+ }
ValueVector Res;
Res.resize(NumElems);
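
Editor's note: the Scalarizer change above splats a scalar base pointer or scalar index into a vector before scattering, so every lane of a vector GEP sees one value per element. Below is a standalone sketch of the splat-then-per-lane address math, using fixed 4 lanes and plain arrays instead of IR vectors.

#include <array>
#include <cstddef>

constexpr std::size_t Lanes = 4;

template <typename T> static std::array<T, Lanes> splat(T Scalar) {
  std::array<T, Lanes> V;
  V.fill(Scalar);                         // same value in every lane
  return V;
}

int main() {
  int Data[16] = {};
  std::array<int *, Lanes> Base = splat(&Data[0]);   // scalar base, splatted
  std::array<long, Lanes> Index = {0, 2, 4, 6};      // already per-lane
  std::array<int *, Lanes> Addr;
  for (std::size_t I = 0; I < Lanes; ++I)
    Addr[I] = Base[I] + Index[I];         // per-lane GEP-style address math
  return Addr[1] == &Data[2] ? 0 : 1;
}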
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index f2723bd7af82..8754c714c5b2 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -130,7 +130,8 @@ static bool mergeEmptyReturnBlocks(Function &F) {
/// iterating until no more changes are made.
static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
AssumptionCache *AC,
- unsigned BonusInstThreshold) {
+ unsigned BonusInstThreshold,
+ bool LateSimplifyCFG) {
bool Changed = false;
bool LocalChange = true;
@@ -145,7 +146,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
// Loop over all of the basic blocks and remove them if they are unneeded.
for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
- if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC, &LoopHeaders)) {
+ if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC, &LoopHeaders, LateSimplifyCFG)) {
LocalChange = true;
++NumSimpl;
}
@@ -156,10 +157,12 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
}
static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
- AssumptionCache *AC, int BonusInstThreshold) {
+ AssumptionCache *AC, int BonusInstThreshold,
+ bool LateSimplifyCFG) {
bool EverChanged = removeUnreachableBlocks(F);
EverChanged |= mergeEmptyReturnBlocks(F);
- EverChanged |= iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold);
+ EverChanged |= iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold,
+ LateSimplifyCFG);
// If neither pass changed anything, we're done.
if (!EverChanged) return false;
@@ -173,7 +176,8 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
return true;
do {
- EverChanged = iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold);
+ EverChanged = iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold,
+ LateSimplifyCFG);
EverChanged |= removeUnreachableBlocks(F);
} while (EverChanged);
@@ -181,17 +185,19 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
}
SimplifyCFGPass::SimplifyCFGPass()
- : BonusInstThreshold(UserBonusInstThreshold) {}
+ : BonusInstThreshold(UserBonusInstThreshold),
+ LateSimplifyCFG(true) {}
-SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold)
- : BonusInstThreshold(BonusInstThreshold) {}
+SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold, bool LateSimplifyCFG)
+ : BonusInstThreshold(BonusInstThreshold),
+ LateSimplifyCFG(LateSimplifyCFG) {}
PreservedAnalyses SimplifyCFGPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
- if (!simplifyFunctionCFG(F, TTI, &AC, BonusInstThreshold))
+ if (!simplifyFunctionCFG(F, TTI, &AC, BonusInstThreshold, LateSimplifyCFG))
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<GlobalsAA>();
@@ -199,16 +205,17 @@ PreservedAnalyses SimplifyCFGPass::run(Function &F,
}
namespace {
-struct CFGSimplifyPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
+struct BaseCFGSimplifyPass : public FunctionPass {
unsigned BonusInstThreshold;
std::function<bool(const Function &)> PredicateFtor;
+ bool LateSimplifyCFG;
- CFGSimplifyPass(int T = -1,
- std::function<bool(const Function &)> Ftor = nullptr)
- : FunctionPass(ID), PredicateFtor(std::move(Ftor)) {
+ BaseCFGSimplifyPass(int T, bool LateSimplifyCFG,
+ std::function<bool(const Function &)> Ftor,
+ char &ID)
+ : FunctionPass(ID), PredicateFtor(std::move(Ftor)),
+ LateSimplifyCFG(LateSimplifyCFG) {
BonusInstThreshold = (T == -1) ? UserBonusInstThreshold : unsigned(T);
- initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override {
if (skipFunction(F) || (PredicateFtor && !PredicateFtor(F)))
@@ -218,7 +225,7 @@ struct CFGSimplifyPass : public FunctionPass {
&getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
const TargetTransformInfo &TTI =
getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- return simplifyFunctionCFG(F, TTI, AC, BonusInstThreshold);
+ return simplifyFunctionCFG(F, TTI, AC, BonusInstThreshold, LateSimplifyCFG);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -227,6 +234,26 @@ struct CFGSimplifyPass : public FunctionPass {
AU.addPreserved<GlobalsAAWrapperPass>();
}
};
+
+struct CFGSimplifyPass : public BaseCFGSimplifyPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ CFGSimplifyPass(int T = -1,
+ std::function<bool(const Function &)> Ftor = nullptr)
+ : BaseCFGSimplifyPass(T, false, Ftor, ID) {
+ initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+ }
+};
+
+struct LateCFGSimplifyPass : public BaseCFGSimplifyPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ LateCFGSimplifyPass(int T = -1,
+ std::function<bool(const Function &)> Ftor = nullptr)
+ : BaseCFGSimplifyPass(T, true, Ftor, ID) {
+ initializeLateCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+ }
+};
}
char CFGSimplifyPass::ID = 0;
@@ -237,9 +264,24 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
false)
+char LateCFGSimplifyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LateCFGSimplifyPass, "latesimplifycfg",
+ "Simplify the CFG more aggressively", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(LateCFGSimplifyPass, "latesimplifycfg",
+ "Simplify the CFG more aggressively", false, false)
+
// Public interface to the CFGSimplification pass
FunctionPass *
llvm::createCFGSimplificationPass(int Threshold,
- std::function<bool(const Function &)> Ftor) {
+ std::function<bool(const Function &)> Ftor) {
return new CFGSimplifyPass(Threshold, std::move(Ftor));
}
+
+// Public interface to the LateCFGSimplification pass
+FunctionPass *
+llvm::createLateCFGSimplificationPass(int Threshold,
+ std::function<bool(const Function &)> Ftor) {
+ return new LateCFGSimplifyPass(Threshold, std::move(Ftor));
+}
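
Editor's note: iterativelySimplifyCFG keeps the usual fixed-point shape even with the new LateSimplifyCFG flag: sweep all blocks, record whether any local change happened, and repeat until a full sweep is quiet. Below is a standalone sketch of that driver shape, with toy integer "blocks" rather than BasicBlocks.

#include <vector>

// Toy local simplification: strictly decreases a block's "work" counter.
static bool simplifyOne(int &Block) {
  if (Block > 0) { --Block; return true; }
  return false;
}

static bool iterativelySimplify(std::vector<int> &Blocks) {
  bool Changed = false;
  bool LocalChange = true;
  while (LocalChange) {
    LocalChange = false;
    for (int &B : Blocks)                 // one sweep over all blocks
      LocalChange |= simplifyOne(B);
    Changed |= LocalChange;
  }
  return Changed;                         // anything simplified at all?
}

int main() {
  std::vector<int> Blocks{3, 0, 1};
  return iterativelySimplify(Blocks) ? 0 : 1;
}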
diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
index c3f14a0f4b1e..102e9eaeab77 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -164,13 +164,14 @@ static bool SinkInstruction(Instruction *Inst,
// Instructions can only be sunk if all their uses are in blocks
// dominated by one of the successors.
- // Look at all the postdominators and see if we can sink it in one.
+ // Look at all the dominated blocks and see if we can sink it in one.
DomTreeNode *DTN = DT.getNode(Inst->getParent());
for (DomTreeNode::iterator I = DTN->begin(), E = DTN->end();
I != E && SuccToSinkTo == nullptr; ++I) {
BasicBlock *Candidate = (*I)->getBlock();
- if ((*I)->getIDom()->getBlock() == Inst->getParent() &&
- IsAcceptableTarget(Inst, Candidate, DT, LI))
+ // A node always immediately dominates its children in the dominator
+ // tree.
+ if (IsAcceptableTarget(Inst, Candidate, DT, LI))
SuccToSinkTo = Candidate;
}
@@ -262,9 +263,8 @@ PreservedAnalyses SinkingPass::run(Function &F, FunctionAnalysisManager &AM) {
if (!iterativelySinkInstructions(F, DT, LI, AA))
return PreservedAnalyses::all();
- auto PA = PreservedAnalyses();
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<LoopAnalysis>();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
return PA;
}
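
Editor's note: the Sink change above drops the redundant getIDom() check because every child of a node in the dominator tree is by definition immediately dominated by it, so each child block is a legal sink candidate on its own. Below is a standalone sketch over a toy dominator-tree node; the types and names are illustrative.

#include <vector>

struct DomNode {
  int Block;
  std::vector<DomNode *> Children;   // blocks immediately dominated by Block
};

// Return the first acceptable child block, or -1 if there is nowhere to sink.
static int findSinkTarget(const DomNode &Home, bool (*IsAcceptable)(int)) {
  for (const DomNode *Child : Home.Children)
    if (IsAcceptable(Child->Block))  // no extra idom check needed
      return Child->Block;
  return -1;
}

int main() {
  DomNode B1{1, {}}, B2{2, {}};
  DomNode Home{0, {&B1, &B2}};
  return findSinkTarget(Home, [](int C) { return C == 2; }) == 2 ? 0 : 1;
}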