Diffstat (limited to 'lib'): 155 files changed, 4496 insertions, 2102 deletions
diff --git a/lib/Analysis/AssumptionCache.cpp b/lib/Analysis/AssumptionCache.cpp index 3c518034ba62..aa55d79b761e 100644 --- a/lib/Analysis/AssumptionCache.cpp +++ b/lib/Analysis/AssumptionCache.cpp @@ -24,6 +24,109 @@ using namespace llvm; using namespace llvm::PatternMatch; +SmallVector<WeakVH, 1> &AssumptionCache::getAffectedValues(Value *V) { + // Try using find_as first to avoid creating extra value handles just for the + // purpose of doing the lookup. + auto AVI = AffectedValues.find_as(V); + if (AVI != AffectedValues.end()) + return AVI->second; + + auto AVIP = AffectedValues.insert({ + AffectedValueCallbackVH(V, this), SmallVector<WeakVH, 1>()}); + return AVIP.first->second; +} + +void AssumptionCache::updateAffectedValues(CallInst *CI) { + // Note: This code must be kept in-sync with the code in + // computeKnownBitsFromAssume in ValueTracking. + + SmallVector<Value *, 16> Affected; + auto AddAffected = [&Affected](Value *V) { + if (isa<Argument>(V)) { + Affected.push_back(V); + } else if (auto *I = dyn_cast<Instruction>(V)) { + Affected.push_back(I); + + if (I->getOpcode() == Instruction::BitCast || + I->getOpcode() == Instruction::PtrToInt) { + auto *Op = I->getOperand(0); + if (isa<Instruction>(Op) || isa<Argument>(Op)) + Affected.push_back(Op); + } + } + }; + + Value *Cond = CI->getArgOperand(0), *A, *B; + AddAffected(Cond); + + CmpInst::Predicate Pred; + if (match(Cond, m_ICmp(Pred, m_Value(A), m_Value(B)))) { + AddAffected(A); + AddAffected(B); + + if (Pred == ICmpInst::ICMP_EQ) { + // For equality comparisons, we handle the case of bit inversion. + auto AddAffectedFromEq = [&AddAffected](Value *V) { + Value *A; + if (match(V, m_Not(m_Value(A)))) { + AddAffected(A); + V = A; + } + + Value *B; + ConstantInt *C; + // (A & B) or (A | B) or (A ^ B). + if (match(V, + m_CombineOr(m_And(m_Value(A), m_Value(B)), + m_CombineOr(m_Or(m_Value(A), m_Value(B)), + m_Xor(m_Value(A), m_Value(B)))))) { + AddAffected(A); + AddAffected(B); + // (A << C) or (A >>_s C) or (A >>_u C) where C is some constant. + } else if (match(V, + m_CombineOr(m_Shl(m_Value(A), m_ConstantInt(C)), + m_CombineOr(m_LShr(m_Value(A), m_ConstantInt(C)), + m_AShr(m_Value(A), + m_ConstantInt(C)))))) { + AddAffected(A); + } + }; + + AddAffectedFromEq(A); + AddAffectedFromEq(B); + } + } + + for (auto &AV : Affected) { + auto &AVV = getAffectedValues(AV); + if (std::find(AVV.begin(), AVV.end(), CI) == AVV.end()) + AVV.push_back(CI); + } +} + +void AssumptionCache::AffectedValueCallbackVH::deleted() { + auto AVI = AC->AffectedValues.find(getValPtr()); + if (AVI != AC->AffectedValues.end()) + AC->AffectedValues.erase(AVI); + // 'this' now dangles! +} + +void AssumptionCache::AffectedValueCallbackVH::allUsesReplacedWith(Value *NV) { + if (!isa<Instruction>(NV) && !isa<Argument>(NV)) + return; + + // Any assumptions that affected this value now affect the new value. + + auto &NAVV = AC->getAffectedValues(NV); + auto AVI = AC->AffectedValues.find(getValPtr()); + if (AVI == AC->AffectedValues.end()) + return; + + for (auto &A : AVI->second) + if (std::find(NAVV.begin(), NAVV.end(), A) == NAVV.end()) + NAVV.push_back(A); +} + void AssumptionCache::scanFunction() { assert(!Scanned && "Tried to scan the function twice!"); assert(AssumeHandles.empty() && "Already have assumes when scanning!"); @@ -37,6 +140,10 @@ void AssumptionCache::scanFunction() { // Mark the scan as complete. Scanned = true; + + // Update affected values. 
+ for (auto &A : AssumeHandles) + updateAffectedValues(cast<CallInst>(A)); } void AssumptionCache::registerAssumption(CallInst *CI) { @@ -72,6 +179,8 @@ void AssumptionCache::registerAssumption(CallInst *CI) { "Cache contains multiple copies of a call!"); } #endif + + updateAffectedValues(CI); } AnalysisKey AssumptionAnalysis::Key; diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt index 08d50c29dfc8..d53364373d7b 100644 --- a/lib/Analysis/CMakeLists.txt +++ b/lib/Analysis/CMakeLists.txt @@ -44,10 +44,10 @@ add_llvm_library(LLVMAnalysis Lint.cpp Loads.cpp LoopAccessAnalysis.cpp + LoopAnalysisManager.cpp LoopUnrollAnalyzer.cpp LoopInfo.cpp LoopPass.cpp - LoopPassManager.cpp MemDepPrinter.cpp MemDerefPrinter.cpp MemoryBuiltins.cpp diff --git a/lib/Analysis/CostModel.cpp b/lib/Analysis/CostModel.cpp index 67d1773f0811..6b77397956cd 100644 --- a/lib/Analysis/CostModel.cpp +++ b/lib/Analysis/CostModel.cpp @@ -438,8 +438,11 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const { getOperandInfo(I->getOperand(0)); TargetTransformInfo::OperandValueKind Op2VK = getOperandInfo(I->getOperand(1)); + SmallVector<const Value*, 2> Operands(I->operand_values()); return TTI->getArithmeticInstrCost(I->getOpcode(), I->getType(), Op1VK, - Op2VK); + Op2VK, TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None, + Operands); } case Instruction::Select: { const SelectInst *SI = cast<SelectInst>(I); diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp index 76e2561b9da3..a661b0101e6a 100644 --- a/lib/Analysis/IVUsers.cpp +++ b/lib/Analysis/IVUsers.cpp @@ -16,8 +16,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" @@ -36,20 +36,9 @@ using namespace llvm; AnalysisKey IVUsersAnalysis::Key; -IVUsers IVUsersAnalysis::run(Loop &L, LoopAnalysisManager &AM) { - const auto &FAM = - AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager(); - Function *F = L.getHeader()->getParent(); - - return IVUsers(&L, FAM.getCachedResult<AssumptionAnalysis>(*F), - FAM.getCachedResult<LoopAnalysis>(*F), - FAM.getCachedResult<DominatorTreeAnalysis>(*F), - FAM.getCachedResult<ScalarEvolutionAnalysis>(*F)); -} - -PreservedAnalyses IVUsersPrinterPass::run(Loop &L, LoopAnalysisManager &AM) { - AM.getResult<IVUsersAnalysis>(L).print(OS); - return PreservedAnalyses::all(); +IVUsers IVUsersAnalysis::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR) { + return IVUsers(&L, &AR.AC, &AR.LI, &AR.DT, &AR.SE); } char IVUsersWrapperPass::ID = 0; diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index 9b9faacd354c..4109049ecabc 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -636,30 +636,27 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) { else if (Caller->optForSize()) Threshold = MinIfValid(Threshold, Params.OptSizeThreshold); - bool HotCallsite = false; - uint64_t TotalWeight; - if (PSI && CS.getInstruction()->extractProfTotalWeight(TotalWeight) && - PSI->isHotCount(TotalWeight)) { - HotCallsite = true; + // Adjust the threshold based on inlinehint attribute and profile based + // hotness information if the caller does not have MinSize attribute. 
+ if (!Caller->optForMinSize()) { + if (Callee.hasFnAttribute(Attribute::InlineHint)) + Threshold = MaxIfValid(Threshold, Params.HintThreshold); + if (PSI) { + uint64_t TotalWeight; + if (CS.getInstruction()->extractProfTotalWeight(TotalWeight) && + PSI->isHotCount(TotalWeight)) { + Threshold = MaxIfValid(Threshold, Params.HotCallSiteThreshold); + } else if (PSI->isFunctionEntryHot(&Callee)) { + // If callsite hotness can not be determined, we may still know + // that the callee is hot and treat it as a weaker hint for threshold + // increase. + Threshold = MaxIfValid(Threshold, Params.HintThreshold); + } else if (PSI->isFunctionEntryCold(&Callee)) { + Threshold = MinIfValid(Threshold, Params.ColdThreshold); + } + } } - // Listen to the inlinehint attribute or profile based hotness information - // when it would increase the threshold and the caller does not need to - // minimize its size. - bool InlineHint = Callee.hasFnAttribute(Attribute::InlineHint) || - (PSI && PSI->isFunctionEntryHot(&Callee)); - if (InlineHint && !Caller->optForMinSize()) - Threshold = MaxIfValid(Threshold, Params.HintThreshold); - - if (HotCallsite && !Caller->optForMinSize()) - Threshold = MaxIfValid(Threshold, Params.HotCallSiteThreshold); - - bool ColdCallee = PSI && PSI->isFunctionEntryCold(&Callee); - // For cold callees, use the ColdThreshold knob if it is available and reduces - // the threshold. - if (ColdCallee) - Threshold = MinIfValid(Threshold, Params.ColdThreshold); - // Finally, take the target-specific inlining threshold multiplier into // account. Threshold *= TTI.getInliningThresholdMultiplier(); diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index 8da2f0981d0c..796e6e444980 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -3583,7 +3583,7 @@ static Value *simplifySelectBitTest(Value *TrueVal, Value *FalseVal, Value *X, *Y == *C) return TrueWhenUnset ? 
TrueVal : FalseVal; } - + return nullptr; } @@ -3595,7 +3595,7 @@ static Value *simplifySelectWithFakeICmpEq(Value *CmpLHS, Value *TrueVal, unsigned BitWidth = TrueVal->getType()->getScalarSizeInBits(); if (!BitWidth) return nullptr; - + APInt MinSignedValue; Value *X; if (match(CmpLHS, m_Trunc(m_Value(X))) && (X == TrueVal || X == FalseVal)) { @@ -4252,14 +4252,36 @@ static Value *SimplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd, const Query &Q, unsigned MaxRecurse) { Intrinsic::ID IID = F->getIntrinsicID(); unsigned NumOperands = std::distance(ArgBegin, ArgEnd); - Type *ReturnType = F->getReturnType(); + + // Unary Ops + if (NumOperands == 1) { + // Perform idempotent optimizations + if (IsIdempotent(IID)) { + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(*ArgBegin)) { + if (II->getIntrinsicID() == IID) + return II; + } + } + + switch (IID) { + case Intrinsic::fabs: { + if (SignBitMustBeZero(*ArgBegin, Q.TLI)) + return *ArgBegin; + } + default: + return nullptr; + } + } // Binary Ops if (NumOperands == 2) { Value *LHS = *ArgBegin; Value *RHS = *(ArgBegin + 1); - if (IID == Intrinsic::usub_with_overflow || - IID == Intrinsic::ssub_with_overflow) { + Type *ReturnType = F->getReturnType(); + + switch (IID) { + case Intrinsic::usub_with_overflow: + case Intrinsic::ssub_with_overflow: { // X - X -> { 0, false } if (LHS == RHS) return Constant::getNullValue(ReturnType); @@ -4268,17 +4290,19 @@ static Value *SimplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd, // undef - X -> undef if (isa<UndefValue>(LHS) || isa<UndefValue>(RHS)) return UndefValue::get(ReturnType); - } - if (IID == Intrinsic::uadd_with_overflow || - IID == Intrinsic::sadd_with_overflow) { + return nullptr; + } + case Intrinsic::uadd_with_overflow: + case Intrinsic::sadd_with_overflow: { // X + undef -> undef if (isa<UndefValue>(RHS)) return UndefValue::get(ReturnType); - } - if (IID == Intrinsic::umul_with_overflow || - IID == Intrinsic::smul_with_overflow) { + return nullptr; + } + case Intrinsic::umul_with_overflow: + case Intrinsic::smul_with_overflow: { // X * 0 -> { 0, false } if (match(RHS, m_Zero())) return Constant::getNullValue(ReturnType); @@ -4286,34 +4310,34 @@ static Value *SimplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd, // X * undef -> { 0, false } if (match(RHS, m_Undef())) return Constant::getNullValue(ReturnType); - } - if (IID == Intrinsic::load_relative && isa<Constant>(LHS) && - isa<Constant>(RHS)) - return SimplifyRelativeLoad(cast<Constant>(LHS), cast<Constant>(RHS), - Q.DL); + return nullptr; + } + case Intrinsic::load_relative: { + Constant *C0 = dyn_cast<Constant>(LHS); + Constant *C1 = dyn_cast<Constant>(RHS); + if (C0 && C1) + return SimplifyRelativeLoad(C0, C1, Q.DL); + return nullptr; + } + default: + return nullptr; + } } // Simplify calls to llvm.masked.load.* - if (IID == Intrinsic::masked_load) { + switch (IID) { + case Intrinsic::masked_load: { Value *MaskArg = ArgBegin[2]; Value *PassthruArg = ArgBegin[3]; // If the mask is all zeros or undef, the "passthru" argument is the result. 
if (maskIsAllZeroOrUndef(MaskArg)) return PassthruArg; + return nullptr; } - - // Perform idempotent optimizations - if (!IsIdempotent(IID)) + default: return nullptr; - - // Unary Ops - if (NumOperands == 1) - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(*ArgBegin)) - if (II->getIntrinsicID() == IID) - return II; - - return nullptr; + } } template <typename IterTy> diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp index 4f6355236873..d442310476cf 100644 --- a/lib/Analysis/LazyValueInfo.cpp +++ b/lib/Analysis/LazyValueInfo.cpp @@ -925,7 +925,7 @@ void LazyValueInfoImpl::intersectAssumeOrGuardBlockValueConstantRange( if (!BBI) return; - for (auto &AssumeVH : AC->assumptions()) { + for (auto &AssumeVH : AC->assumptionsFor(Val)) { if (!AssumeVH) continue; auto *I = cast<CallInst>(AssumeVH); diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp index 2f3dca3d23fa..bf8007213097 100644 --- a/lib/Analysis/LoopAccessAnalysis.cpp +++ b/lib/Analysis/LoopAccessAnalysis.cpp @@ -12,22 +12,22 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/EquivalenceClasses.h" -#include "llvm/ADT/iterator_range.h" #include "llvm/ADT/PointerIntPair.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -44,10 +44,10 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" @@ -2120,35 +2120,9 @@ INITIALIZE_PASS_END(LoopAccessLegacyAnalysis, LAA_NAME, laa_name, false, true) AnalysisKey LoopAccessAnalysis::Key; -LoopAccessInfo LoopAccessAnalysis::run(Loop &L, LoopAnalysisManager &AM) { - const FunctionAnalysisManager &FAM = - AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager(); - Function &F = *L.getHeader()->getParent(); - auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(F); - auto *TLI = FAM.getCachedResult<TargetLibraryAnalysis>(F); - auto *AA = FAM.getCachedResult<AAManager>(F); - auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); - auto *LI = FAM.getCachedResult<LoopAnalysis>(F); - if (!SE) - report_fatal_error( - "ScalarEvolution must have been cached at a higher level"); - if (!AA) - report_fatal_error("AliasAnalysis must have been cached at a higher level"); - if (!DT) - report_fatal_error("DominatorTree must have been cached at a higher level"); - if (!LI) - report_fatal_error("LoopInfo must have been cached at a higher level"); - return LoopAccessInfo(&L, SE, TLI, AA, DT, LI); -} - -PreservedAnalyses LoopAccessInfoPrinterPass::run(Loop &L, - LoopAnalysisManager 
&AM) { - Function &F = *L.getHeader()->getParent(); - auto &LAI = AM.getResult<LoopAccessAnalysis>(L); - OS << "Loop access info in function '" << F.getName() << "':\n"; - OS.indent(2) << L.getHeader()->getName() << ":\n"; - LAI.print(OS, 4); - return PreservedAnalyses::all(); +LoopAccessInfo LoopAccessAnalysis::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR) { + return LoopAccessInfo(&L, &AR.SE, &AR.TLI, &AR.AA, &AR.DT, &AR.LI); } namespace llvm { diff --git a/lib/Analysis/LoopAnalysisManager.cpp b/lib/Analysis/LoopAnalysisManager.cpp new file mode 100644 index 000000000000..5be3ee341c9c --- /dev/null +++ b/lib/Analysis/LoopAnalysisManager.cpp @@ -0,0 +1,160 @@ +//===- LoopAnalysisManager.cpp - Loop analysis management -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/IR/Dominators.h" + +using namespace llvm; + +// Explicit template instantiations and specialization defininitions for core +// template typedefs. +namespace llvm { +template class AllAnalysesOn<Loop>; +template class AnalysisManager<Loop, LoopStandardAnalysisResults &>; +template class InnerAnalysisManagerProxy<LoopAnalysisManager, Function>; +template class OuterAnalysisManagerProxy<FunctionAnalysisManager, Loop, + LoopStandardAnalysisResults &>; + +bool LoopAnalysisManagerFunctionProxy::Result::invalidate( + Function &F, const PreservedAnalyses &PA, + FunctionAnalysisManager::Invalidator &Inv) { + // First compute the sequence of IR units covered by this proxy. We will want + // to visit this in postorder, but because this is a tree structure we can do + // this by building a preorder sequence and walking it in reverse. + SmallVector<Loop *, 4> PreOrderLoops, PreOrderWorklist; + // Note that we want to walk the roots in reverse order because we will end + // up reversing the preorder sequence. However, it happens that the loop nest + // roots are in reverse order within the LoopInfo object. So we just walk + // forward here. + // FIXME: If we change the order of LoopInfo we will want to add a reverse + // here. + for (Loop *RootL : *LI) { + assert(PreOrderWorklist.empty() && + "Must start with an empty preorder walk worklist."); + PreOrderWorklist.push_back(RootL); + do { + Loop *L = PreOrderWorklist.pop_back_val(); + PreOrderWorklist.append(L->begin(), L->end()); + PreOrderLoops.push_back(L); + } while (!PreOrderWorklist.empty()); + } + + // If this proxy or the loop info is going to be invalidated, we also need + // to clear all the keys coming from that analysis. We also completely blow + // away the loop analyses if any of the standard analyses provided by the + // loop pass manager go away so that loop analyses can freely use these + // without worrying about declaring dependencies on them etc. + // FIXME: It isn't clear if this is the right tradeoff. We could instead make + // loop analyses declare any dependencies on these and use the more general + // invalidation logic below to act on that. 
+ auto PAC = PA.getChecker<LoopAnalysisManagerFunctionProxy>(); + if (!(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>()) || + Inv.invalidate<AAManager>(F, PA) || + Inv.invalidate<AssumptionAnalysis>(F, PA) || + Inv.invalidate<DominatorTreeAnalysis>(F, PA) || + Inv.invalidate<LoopAnalysis>(F, PA) || + Inv.invalidate<ScalarEvolutionAnalysis>(F, PA)) { + // Note that the LoopInfo may be stale at this point, however the loop + // objects themselves remain the only viable keys that could be in the + // analysis manager's cache. So we just walk the keys and forcibly clear + // those results. Note that the order doesn't matter here as this will just + // directly destroy the results without calling methods on them. + for (Loop *L : PreOrderLoops) + InnerAM->clear(*L); + + // We also need to null out the inner AM so that when the object gets + // destroyed as invalid we don't try to clear the inner AM again. At that + // point we won't be able to reliably walk the loops for this function and + // only clear results associated with those loops the way we do here. + // FIXME: Making InnerAM null at this point isn't very nice. Most analyses + // try to remain valid during invalidation. Maybe we should add an + // `IsClean` flag? + InnerAM = nullptr; + + // Now return true to indicate this *is* invalid and a fresh proxy result + // needs to be built. This is especially important given the null InnerAM. + return true; + } + + // Directly check if the relevant set is preserved so we can short circuit + // invalidating loops. + bool AreLoopAnalysesPreserved = + PA.allAnalysesInSetPreserved<AllAnalysesOn<Loop>>(); + + // Since we have a valid LoopInfo we can actually leave the cached results in + // the analysis manager associated with the Loop keys, but we need to + // propagate any necessary invalidation logic into them. We'd like to + // invalidate things in roughly the same order as they were put into the + // cache and so we walk the preorder list in reverse to form a valid + // postorder. + for (Loop *L : reverse(PreOrderLoops)) { + Optional<PreservedAnalyses> InnerPA; + + // Check to see whether the preserved set needs to be adjusted based on + // function-level analysis invalidation triggering deferred invalidation + // for this loop. + if (auto *OuterProxy = + InnerAM->getCachedResult<FunctionAnalysisManagerLoopProxy>(*L)) + for (const auto &OuterInvalidationPair : + OuterProxy->getOuterInvalidations()) { + AnalysisKey *OuterAnalysisID = OuterInvalidationPair.first; + const auto &InnerAnalysisIDs = OuterInvalidationPair.second; + if (Inv.invalidate(OuterAnalysisID, F, PA)) { + if (!InnerPA) + InnerPA = PA; + for (AnalysisKey *InnerAnalysisID : InnerAnalysisIDs) + InnerPA->abandon(InnerAnalysisID); + } + } + + // Check if we needed a custom PA set. If so we'll need to run the inner + // invalidation. + if (InnerPA) { + InnerAM->invalidate(*L, *InnerPA); + continue; + } + + // Otherwise we only need to do invalidation if the original PA set didn't + // preserve all Loop analyses. + if (!AreLoopAnalysesPreserved) + InnerAM->invalidate(*L, PA); + } + + // Return false to indicate that this result is still a valid proxy. 
+ return false; +} + +template <> +LoopAnalysisManagerFunctionProxy::Result +LoopAnalysisManagerFunctionProxy::run(Function &F, + FunctionAnalysisManager &AM) { + return Result(*InnerAM, AM.getResult<LoopAnalysis>(F)); +} +} + +PreservedAnalyses llvm::getLoopPassPreservedAnalyses() { + PreservedAnalyses PA; + PA.preserve<AssumptionAnalysis>(); + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<LoopAnalysis>(); + PA.preserve<LoopAnalysisManagerFunctionProxy>(); + PA.preserve<ScalarEvolutionAnalysis>(); + // TODO: What we really want to do here is preserve an AA category, but that + // concept doesn't exist yet. + PA.preserve<AAManager>(); + PA.preserve<BasicAA>(); + PA.preserve<GlobalsAA>(); + PA.preserve<SCEVAA>(); + return PA; +} diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp index 3d85ef6988a9..f449ce94d57c 100644 --- a/lib/Analysis/LoopInfo.cpp +++ b/lib/Analysis/LoopInfo.cpp @@ -689,18 +689,13 @@ PreservedAnalyses LoopPrinterPass::run(Function &F, return PreservedAnalyses::all(); } -PrintLoopPass::PrintLoopPass() : OS(dbgs()) {} -PrintLoopPass::PrintLoopPass(raw_ostream &OS, const std::string &Banner) - : OS(OS), Banner(Banner) {} - -PreservedAnalyses PrintLoopPass::run(Loop &L, AnalysisManager<Loop> &) { +void llvm::printLoop(Loop &L, raw_ostream &OS, const std::string &Banner) { OS << Banner; for (auto *Block : L.blocks()) if (Block) Block->print(OS); else OS << "Printing <null> block"; - return PreservedAnalyses::all(); } //===----------------------------------------------------------------------===// diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp index b5b8040984d7..3f4a07942154 100644 --- a/lib/Analysis/LoopPass.cpp +++ b/lib/Analysis/LoopPass.cpp @@ -14,7 +14,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/LoopPassManager.h" +#include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/LLVMContext.h" @@ -32,13 +32,14 @@ namespace { /// PrintLoopPass - Print a Function corresponding to a Loop. /// class PrintLoopPassWrapper : public LoopPass { - PrintLoopPass P; + raw_ostream &OS; + std::string Banner; public: static char ID; - PrintLoopPassWrapper() : LoopPass(ID) {} + PrintLoopPassWrapper() : LoopPass(ID), OS(dbgs()) {} PrintLoopPassWrapper(raw_ostream &OS, const std::string &Banner) - : LoopPass(ID), P(OS, Banner) {} + : LoopPass(ID), OS(OS), Banner(Banner) {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); @@ -49,8 +50,7 @@ public: [](BasicBlock *BB) { return BB; }); if (BBI != L->blocks().end() && isFunctionInPrintList((*BBI)->getParent()->getName())) { - LoopAnalysisManager DummyLAM; - P.run(*L, DummyLAM); + printLoop(*L, OS, Banner); } return false; } diff --git a/lib/Analysis/LoopPassManager.cpp b/lib/Analysis/LoopPassManager.cpp deleted file mode 100644 index 044e5d55dafd..000000000000 --- a/lib/Analysis/LoopPassManager.cpp +++ /dev/null @@ -1,59 +0,0 @@ -//===- LoopPassManager.cpp - Loop pass management -------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/Analysis/LoopPassManager.h" -#include "llvm/Analysis/BasicAliasAnalysis.h" -#include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" -#include "llvm/IR/Dominators.h" - -using namespace llvm; - -// Explicit template instantiations and specialization defininitions for core -// template typedefs. -namespace llvm { -template class PassManager<Loop>; -template class AnalysisManager<Loop>; -template class InnerAnalysisManagerProxy<LoopAnalysisManager, Function>; -template class OuterAnalysisManagerProxy<FunctionAnalysisManager, Loop>; - -template <> -bool LoopAnalysisManagerFunctionProxy::Result::invalidate( - Function &F, const PreservedAnalyses &PA, - FunctionAnalysisManager::Invalidator &Inv) { - // If this proxy isn't marked as preserved, the set of Function objects in - // the module may have changed. We therefore can't call - // InnerAM->invalidate(), because any pointers to Functions it has may be - // stale. - auto PAC = PA.getChecker<LoopAnalysisManagerFunctionProxy>(); - if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Loop>>()) - InnerAM->clear(); - - // FIXME: Proper suppor for invalidation isn't yet implemented for the LPM. - - // Return false to indicate that this result is still a valid proxy. - return false; -} -} - -PreservedAnalyses llvm::getLoopPassPreservedAnalyses() { - PreservedAnalyses PA; - PA.preserve<DominatorTreeAnalysis>(); - PA.preserve<LoopAnalysis>(); - PA.preserve<ScalarEvolutionAnalysis>(); - // TODO: What we really want to do here is preserve an AA category, but that - // concept doesn't exist yet. - PA.preserve<AAManager>(); - PA.preserve<BasicAA>(); - PA.preserve<GlobalsAA>(); - PA.preserve<SCEVAA>(); - return PA; -} diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp index e7415e623196..66a0d145dcd8 100644 --- a/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -323,17 +323,28 @@ MemDepResult MemoryDependenceResults::getPointerDependencyFrom( const MemoryLocation &MemLoc, bool isLoad, BasicBlock::iterator ScanIt, BasicBlock *BB, Instruction *QueryInst, unsigned *Limit) { + MemDepResult InvariantGroupDependency = MemDepResult::getUnknown(); if (QueryInst != nullptr) { if (auto *LI = dyn_cast<LoadInst>(QueryInst)) { - MemDepResult invariantGroupDependency = - getInvariantGroupPointerDependency(LI, BB); + InvariantGroupDependency = getInvariantGroupPointerDependency(LI, BB); - if (invariantGroupDependency.isDef()) - return invariantGroupDependency; + if (InvariantGroupDependency.isDef()) + return InvariantGroupDependency; } } - return getSimplePointerDependencyFrom(MemLoc, isLoad, ScanIt, BB, QueryInst, - Limit); + MemDepResult SimpleDep = getSimplePointerDependencyFrom( + MemLoc, isLoad, ScanIt, BB, QueryInst, Limit); + if (SimpleDep.isDef()) + return SimpleDep; + // Non-local invariant group dependency indicates there is non local Def + // (it only returns nonLocal if it finds nonLocal def), which is better than + // local clobber and everything else. 
+ if (InvariantGroupDependency.isNonLocal()) + return InvariantGroupDependency; + + assert(InvariantGroupDependency.isUnknown() && + "InvariantGroupDependency should be only unknown at this point"); + return SimpleDep; } MemDepResult @@ -358,6 +369,20 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI, // Queue to process all pointers that are equivalent to load operand. SmallVector<const Value *, 8> LoadOperandsQueue; LoadOperandsQueue.push_back(LoadOperand); + + Instruction *ClosestDependency = nullptr; + // Order of instructions in uses list is unpredictible. In order to always + // get the same result, we will look for the closest dominance. + auto GetClosestDependency = [this](Instruction *Best, Instruction *Other) { + assert(Other && "Must call it with not null instruction"); + if (Best == nullptr || DT.dominates(Best, Other)) + return Other; + return Best; + }; + + + // FIXME: This loop is O(N^2) because dominates can be O(n) and in worst case + // we will see all the instructions. This should be fixed in MSSA. while (!LoadOperandsQueue.empty()) { const Value *Ptr = LoadOperandsQueue.pop_back_val(); assert(Ptr && !isa<GlobalValue>(Ptr) && @@ -388,12 +413,24 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI, // If we hit load/store with the same invariant.group metadata (and the // same pointer operand) we can assume that value pointed by pointer // operand didn't change. - if ((isa<LoadInst>(U) || isa<StoreInst>(U)) && U->getParent() == BB && + if ((isa<LoadInst>(U) || isa<StoreInst>(U)) && U->getMetadata(LLVMContext::MD_invariant_group) == InvariantGroupMD) - return MemDepResult::getDef(U); + ClosestDependency = GetClosestDependency(ClosestDependency, U); } } - return MemDepResult::getUnknown(); + + if (!ClosestDependency) + return MemDepResult::getUnknown(); + if (ClosestDependency->getParent() == BB) + return MemDepResult::getDef(ClosestDependency); + // Def(U) can't be returned here because it is non-local. If local + // dependency won't be found then return nonLocal counting that the + // user will call getNonLocalPointerDependency, which will return cached + // result. + NonLocalDefsCache.try_emplace( + LI, NonLocalDepResult(ClosestDependency->getParent(), + MemDepResult::getDef(ClosestDependency), nullptr)); + return MemDepResult::getNonLocal(); } MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( @@ -877,7 +914,17 @@ void MemoryDependenceResults::getNonLocalPointerDependency( assert(Loc.Ptr->getType()->isPointerTy() && "Can't get pointer deps of a non-pointer!"); Result.clear(); - + { + // Check if there is cached Def with invariant.group. FIXME: cache might be + // invalid if cached instruction would be removed between call to + // getPointerDependencyFrom and this function. + auto NonLocalDefIt = NonLocalDefsCache.find(QueryInst); + if (NonLocalDefIt != NonLocalDefsCache.end()) { + Result.push_back(std::move(NonLocalDefIt->second)); + NonLocalDefsCache.erase(NonLocalDefIt); + return; + } + } // This routine does not expect to deal with volatile instructions. // Doing so would require piping through the QueryInst all the way through. 
// TODO: volatiles can't be elided, but they can be reordered with other diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 44f1a6dde0d2..b3905cc01e84 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -7032,20 +7032,21 @@ static const SCEV *SolveLinEquationWithOverflow(const APInt &A, const APInt &B, // 3. Compute I: the multiplicative inverse of (A / D) in arithmetic // modulo (N / D). // - // (N / D) may need BW+1 bits in its representation. Hence, we'll use this - // bit width during computations. + // If D == 1, (N / D) == N == 2^BW, so we need one extra bit to represent + // (N / D) in general. The inverse itself always fits into BW bits, though, + // so we immediately truncate it. APInt AD = A.lshr(Mult2).zext(BW + 1); // AD = A / D APInt Mod(BW + 1, 0); Mod.setBit(BW - Mult2); // Mod = N / D - APInt I = AD.multiplicativeInverse(Mod); + APInt I = AD.multiplicativeInverse(Mod).trunc(BW); // 4. Compute the minimum unsigned root of the equation: // I * (B / D) mod (N / D) - APInt Result = (I * B.lshr(Mult2).zext(BW + 1)).urem(Mod); + // To simplify the computation, we factor out the divide by D: + // (I * B mod N) / D + APInt Result = (I * B).lshr(Mult2); - // The result is guaranteed to be less than 2^BW so we may truncate it to BW - // bits. - return SE.getConstant(Result.trunc(BW)); + return SE.getConstant(Result); } /// Find the roots of the quadratic equation for the given quadratic chrec @@ -7206,17 +7207,25 @@ ScalarEvolution::howFarToZero(const SCEV *V, const Loop *L, bool ControlsExit, // 1*N = -Start; -1*N = Start (mod 2^BW), so: // N = Distance (as unsigned) if (StepC->getValue()->equalsInt(1) || StepC->getValue()->isAllOnesValue()) { - ConstantRange CR = getUnsignedRange(Start); - const SCEV *MaxBECount; - if (!CountDown && CR.getUnsignedMin().isMinValue()) - // When counting up, the worst starting value is 1, not 0. - MaxBECount = CR.getUnsignedMax().isMinValue() - ? getConstant(APInt::getMinValue(CR.getBitWidth())) - : getConstant(APInt::getMaxValue(CR.getBitWidth())); - else - MaxBECount = getConstant(CountDown ? CR.getUnsignedMax() - : -CR.getUnsignedMin()); - return ExitLimit(Distance, MaxBECount, false, Predicates); + APInt MaxBECount = getUnsignedRange(Distance).getUnsignedMax(); + + // When a loop like "for (int i = 0; i != n; ++i) { /* body */ }" is rotated, + // we end up with a loop whose backedge-taken count is n - 1. Detect this + // case, and see if we can improve the bound. + // + // Explicitly handling this here is necessary because getUnsignedRange + // isn't context-sensitive; it doesn't know that we only care about the + // range inside the loop. + const SCEV *Zero = getZero(Distance->getType()); + const SCEV *One = getOne(Distance->getType()); + const SCEV *DistancePlusOne = getAddExpr(Distance, One); + if (isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, DistancePlusOne, Zero)) { + // If Distance + 1 doesn't overflow, we can compute the maximum distance + // as "unsigned_max(Distance + 1) - 1". 
+ ConstantRange CR = getUnsignedRange(DistancePlusOne); + MaxBECount = APIntOps::umin(MaxBECount, CR.getUnsignedMax() - 1); + } + return ExitLimit(Distance, getConstant(MaxBECount), false, Predicates); } // As a special case, handle the instance where Step is a positive power of diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index cd8c24630df1..5c0d1aac1b98 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -277,9 +277,10 @@ unsigned TargetTransformInfo::getMaxInterleaveFactor(unsigned VF) const { int TargetTransformInfo::getArithmeticInstrCost( unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo, - OperandValueProperties Opd2PropInfo) const { + OperandValueProperties Opd2PropInfo, + ArrayRef<const Value *> Args) const { int Cost = TTIImpl->getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo); + Opd1PropInfo, Opd2PropInfo, Args); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index d31472c0d33c..b79370baad10 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -526,7 +526,10 @@ static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero, unsigned BitWidth = KnownZero.getBitWidth(); - for (auto &AssumeVH : Q.AC->assumptions()) { + // Note that the patterns below need to be kept in sync with the code + // in AssumptionCache::updateAffectedValues. + + for (auto &AssumeVH : Q.AC->assumptionsFor(V)) { if (!AssumeVH) continue; CallInst *I = cast<CallInst>(AssumeVH); @@ -2580,51 +2583,70 @@ bool llvm::CannotBeNegativeZero(const Value *V, const TargetLibraryInfo *TLI, return false; } -bool llvm::CannotBeOrderedLessThanZero(const Value *V, - const TargetLibraryInfo *TLI, - unsigned Depth) { - if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V)) - return !CFP->getValueAPF().isNegative() || CFP->getValueAPF().isZero(); +/// If \p SignBitOnly is true, test for a known 0 sign bit rather than a +/// standard ordered compare. e.g. make -0.0 olt 0.0 be true because of the sign +/// bit despite comparing equal. +static bool cannotBeOrderedLessThanZeroImpl(const Value *V, + const TargetLibraryInfo *TLI, + bool SignBitOnly, + unsigned Depth) { + if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V)) { + return !CFP->getValueAPF().isNegative() || + (!SignBitOnly && CFP->getValueAPF().isZero()); + } if (Depth == MaxDepth) - return false; // Limit search depth. + return false; // Limit search depth. const Operator *I = dyn_cast<Operator>(V); - if (!I) return false; + if (!I) + return false; switch (I->getOpcode()) { - default: break; + default: + break; // Unsigned integers are always nonnegative. case Instruction::UIToFP: return true; case Instruction::FMul: // x*x is always non-negative or a NaN. 
- if (I->getOperand(0) == I->getOperand(1)) + if (I->getOperand(0) == I->getOperand(1) && + (!SignBitOnly || cast<FPMathOperator>(I)->hasNoNaNs())) return true; + LLVM_FALLTHROUGH; case Instruction::FAdd: case Instruction::FDiv: case Instruction::FRem: - return CannotBeOrderedLessThanZero(I->getOperand(0), TLI, Depth + 1) && - CannotBeOrderedLessThanZero(I->getOperand(1), TLI, Depth + 1); + return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly, + Depth + 1) && + cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly, + Depth + 1); case Instruction::Select: - return CannotBeOrderedLessThanZero(I->getOperand(1), TLI, Depth + 1) && - CannotBeOrderedLessThanZero(I->getOperand(2), TLI, Depth + 1); + return cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly, + Depth + 1) && + cannotBeOrderedLessThanZeroImpl(I->getOperand(2), TLI, SignBitOnly, + Depth + 1); case Instruction::FPExt: case Instruction::FPTrunc: // Widening/narrowing never change sign. - return CannotBeOrderedLessThanZero(I->getOperand(0), TLI, Depth + 1); + return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly, + Depth + 1); case Instruction::Call: Intrinsic::ID IID = getIntrinsicForCallSite(cast<CallInst>(I), TLI); switch (IID) { default: break; case Intrinsic::maxnum: - return CannotBeOrderedLessThanZero(I->getOperand(0), TLI, Depth + 1) || - CannotBeOrderedLessThanZero(I->getOperand(1), TLI, Depth + 1); + return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly, + Depth + 1) || + cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly, + Depth + 1); case Intrinsic::minnum: - return CannotBeOrderedLessThanZero(I->getOperand(0), TLI, Depth + 1) && - CannotBeOrderedLessThanZero(I->getOperand(1), TLI, Depth + 1); + return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly, + Depth + 1) && + cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly, + Depth + 1); case Intrinsic::exp: case Intrinsic::exp2: case Intrinsic::fabs: @@ -2636,18 +2658,30 @@ bool llvm::CannotBeOrderedLessThanZero(const Value *V, if (CI->getBitWidth() <= 64 && CI->getSExtValue() % 2u == 0) return true; } - return CannotBeOrderedLessThanZero(I->getOperand(0), TLI, Depth + 1); + return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly, + Depth + 1); case Intrinsic::fma: case Intrinsic::fmuladd: // x*x+y is non-negative if y is non-negative. return I->getOperand(0) == I->getOperand(1) && - CannotBeOrderedLessThanZero(I->getOperand(2), TLI, Depth + 1); + (!SignBitOnly || cast<FPMathOperator>(I)->hasNoNaNs()) && + cannotBeOrderedLessThanZeroImpl(I->getOperand(2), TLI, SignBitOnly, + Depth + 1); } break; } return false; } +bool llvm::CannotBeOrderedLessThanZero(const Value *V, + const TargetLibraryInfo *TLI) { + return cannotBeOrderedLessThanZeroImpl(V, TLI, false, 0); +} + +bool llvm::SignBitMustBeZero(const Value *V, const TargetLibraryInfo *TLI) { + return cannotBeOrderedLessThanZeroImpl(V, TLI, true, 0); +} + /// If the specified value can be set by repeating the same byte in memory, /// return the i8 value that it is represented with. 
This is /// true for all i8 values obviously, but is also true for i32 0, i32 -1, diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 54aa0a9e3282..76549540ce0f 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -22,3 +22,4 @@ add_subdirectory(ProfileData) add_subdirectory(Fuzzer) add_subdirectory(Passes) add_subdirectory(LibDriver) +add_subdirectory(XRay) diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 408b34a3cdc0..83440513225c 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -13,11 +13,13 @@ #include "CodeViewDebug.h" #include "llvm/ADT/TinyPtrVector.h" +#include "llvm/DebugInfo/CodeView/CVTypeDumper.h" #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/Line.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" -#include "llvm/DebugInfo/CodeView/TypeDumper.h" +#include "llvm/DebugInfo/CodeView/TypeDatabase.h" +#include "llvm/DebugInfo/CodeView/TypeDumpVisitor.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" @@ -467,7 +469,8 @@ void CodeViewDebug::emitTypeInformation() { CommentPrefix += ' '; } - CVTypeDumper CVTD(nullptr, /*PrintRecordBytes=*/false); + TypeDatabase TypeDB; + CVTypeDumper CVTD(TypeDB); TypeTable.ForEachRecord([&](TypeIndex Index, ArrayRef<uint8_t> Record) { if (OS.isVerboseAsm()) { // Emit a block comment describing the type record for readability. @@ -475,8 +478,8 @@ void CodeViewDebug::emitTypeInformation() { raw_svector_ostream CommentOS(CommentBlock); ScopedPrinter SP(CommentOS); SP.setPrefix(CommentPrefix); - CVTD.setPrinter(&SP); - Error E = CVTD.dump(Record); + TypeDumpVisitor TDV(TypeDB, &SP, false); + Error E = CVTD.dump(Record, TDV); if (E) { logAllUnhandledErrors(std::move(E), errs(), "error: "); llvm_unreachable("produced malformed type record"); diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp index 8ae2f2487cad..a8a3b30d5b60 100644 --- a/lib/CodeGen/AsmPrinter/DIE.cpp +++ b/lib/CodeGen/AsmPrinter/DIE.cpp @@ -79,6 +79,13 @@ void DIEAbbrev::Emit(const AsmPrinter *AP) const { // Emit form type. AP->EmitULEB128(AttrData.getForm(), dwarf::FormEncodingString(AttrData.getForm()).data()); + + // Emit value for DW_FORM_implicit_const. + if (AttrData.getForm() == dwarf::DW_FORM_implicit_const) { + assert(AP->getDwarfVersion() >= 5 && + "DW_FORM_implicit_const is supported starting from DWARFv5"); + AP->EmitSLEB128(AttrData.getValue()); + } } // Mark end of abbreviation. @@ -160,7 +167,11 @@ DIE *DIE::getParent() const { DIEAbbrev DIE::generateAbbrev() const { DIEAbbrev Abbrev(Tag, hasChildren()); for (const DIEValue &V : values()) - Abbrev.AddAttribute(V.getAttribute(), V.getForm()); + if (V.getForm() == dwarf::DW_FORM_implicit_const) + Abbrev.AddImplicitConstAttribute(V.getAttribute(), + V.getDIEInteger().getValue()); + else + Abbrev.AddAttribute(V.getAttribute(), V.getForm()); return Abbrev; } @@ -342,6 +353,8 @@ void DIEValue::dump() const { /// void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const { switch (Form) { + case dwarf::DW_FORM_implicit_const: + LLVM_FALLTHROUGH; case dwarf::DW_FORM_flag_present: // Emit something to keep the lines and comments in sync. // FIXME: Is there a better way to do this? 
@@ -406,6 +419,7 @@ void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const { /// unsigned DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const { switch (Form) { + case dwarf::DW_FORM_implicit_const: LLVM_FALLTHROUGH; case dwarf::DW_FORM_flag_present: return 0; case dwarf::DW_FORM_flag: LLVM_FALLTHROUGH; case dwarf::DW_FORM_ref1: LLVM_FALLTHROUGH; diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 4f90245c6d49..2a866c071f59 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -200,6 +200,8 @@ void DwarfUnit::addUInt(DIEValueList &Die, dwarf::Attribute Attribute, Optional<dwarf::Form> Form, uint64_t Integer) { if (!Form) Form = DIEInteger::BestForm(false, Integer); + assert(Form != dwarf::DW_FORM_implicit_const && + "DW_FORM_implicit_const is used only for signed integers"); Die.addValue(DIEValueAllocator, Attribute, *Form, DIEInteger(Integer)); } diff --git a/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/lib/CodeGen/GlobalISel/RegBankSelect.cpp index 04bb7ca5ba9e..cc026ef27296 100644 --- a/lib/CodeGen/GlobalISel/RegBankSelect.cpp +++ b/lib/CodeGen/GlobalISel/RegBankSelect.cpp @@ -223,6 +223,7 @@ RegisterBankInfo::InstructionMapping &RegBankSelect::findBestMapping( for (RegisterBankInfo::InstructionMapping &CurMapping : PossibleMappings) { MappingCost CurCost = computeMapping(MI, CurMapping, LocalRepairPts, &Cost); if (CurCost < Cost) { + DEBUG(dbgs() << "New best: " << CurCost << '\n'); Cost = CurCost; BestMapping = &CurMapping; RepairPts.clear(); @@ -377,8 +378,10 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping( DEBUG(dbgs() << "Evaluating mapping cost for: " << MI); DEBUG(dbgs() << "With: " << InstrMapping << '\n'); RepairPts.clear(); - if (BestCost && Cost > *BestCost) + if (BestCost && Cost > *BestCost) { + DEBUG(dbgs() << "Mapping is too expensive from the start\n"); return Cost; + } // Moreover, to realize this mapping, the register bank of each operand must // match this mapping. In other words, we may need to locally reassign the @@ -392,17 +395,17 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping( unsigned Reg = MO.getReg(); if (!Reg) continue; - DEBUG(dbgs() << "Opd" << OpIdx); + DEBUG(dbgs() << "Opd" << OpIdx << '\n'); const RegisterBankInfo::ValueMapping &ValMapping = InstrMapping.getOperandMapping(OpIdx); // If Reg is already properly mapped, this is free. bool Assign; if (assignmentMatch(Reg, ValMapping, Assign)) { - DEBUG(dbgs() << " is free (match).\n"); + DEBUG(dbgs() << "=> is free (match).\n"); continue; } if (Assign) { - DEBUG(dbgs() << " is free (simple assignment).\n"); + DEBUG(dbgs() << "=> is free (simple assignment).\n"); RepairPts.emplace_back(RepairingPlacement(MI, OpIdx, *TRI, *this, RepairingPlacement::Reassign)); continue; @@ -420,8 +423,10 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping( tryAvoidingSplit(RepairPt, MO, ValMapping); // Check that the materialization of the repairing is possible. - if (!RepairPt.canMaterialize()) + if (!RepairPt.canMaterialize()) { + DEBUG(dbgs() << "Mapping involves impossible repairing\n"); return MappingCost::ImpossibleCost(); + } // Account for the split cost and repair cost. // Unless the cost is already saturated or we do not care about the cost. @@ -476,8 +481,10 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping( // Stop looking into what it takes to repair, this is already // too expensive. 
- if (BestCost && Cost > *BestCost) + if (BestCost && Cost > *BestCost) { + DEBUG(dbgs() << "Mapping is too expensive, stop processing\n"); return Cost; + } // No need to accumulate more cost information. // We need to still gather the repairing information though. @@ -485,6 +492,7 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping( break; } } + DEBUG(dbgs() << "Total cost is: " << Cost << "\n"); return Cost; } @@ -550,7 +558,7 @@ bool RegBankSelect::assignInstr(MachineInstr &MI) { // Make sure the mapping is valid for MI. assert(BestMapping.verify(MI) && "Invalid instruction mapping"); - DEBUG(dbgs() << "Mapping: " << BestMapping << '\n'); + DEBUG(dbgs() << "Best Mapping: " << BestMapping << '\n'); // After this call, MI may not be valid anymore. // Do not use it. @@ -959,3 +967,20 @@ bool RegBankSelect::MappingCost::operator==(const MappingCost &Cost) const { return LocalCost == Cost.LocalCost && NonLocalCost == Cost.NonLocalCost && LocalFreq == Cost.LocalFreq; } + +void RegBankSelect::MappingCost::dump() const { + print(dbgs()); + dbgs() << '\n'; +} + +void RegBankSelect::MappingCost::print(raw_ostream &OS) const { + if (*this == ImpossibleCost()) { + OS << "impossible"; + return; + } + if (isSaturated()) { + OS << "saturated"; + return; + } + OS << LocalFreq << " * " << LocalCost << " + " << NonLocalCost; +} diff --git a/lib/CodeGen/GlobalISel/RegisterBank.cpp b/lib/CodeGen/GlobalISel/RegisterBank.cpp index 0ffc08188ead..49d676f11da6 100644 --- a/lib/CodeGen/GlobalISel/RegisterBank.cpp +++ b/lib/CodeGen/GlobalISel/RegisterBank.cpp @@ -19,12 +19,15 @@ using namespace llvm; const unsigned RegisterBank::InvalidID = UINT_MAX; -RegisterBank::RegisterBank() : ID(InvalidID), Name(nullptr), Size(0) {} +RegisterBank::RegisterBank(unsigned ID, const char *Name, unsigned Size, + const uint32_t *CoveredClasses) + : ID(ID), Name(Name), Size(Size) { + ContainedRegClasses.resize(200); + ContainedRegClasses.setBitsInMask(CoveredClasses); +} bool RegisterBank::verify(const TargetRegisterInfo &TRI) const { assert(isValid() && "Invalid register bank"); - assert(ContainedRegClasses.size() == TRI.getNumRegClasses() && - "TRI does not match the initialization process?"); for (unsigned RCId = 0, End = TRI.getNumRegClasses(); RCId != End; ++RCId) { const TargetRegisterClass &RC = *TRI.getRegClass(RCId); diff --git a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp index 7d405dd92ac3..da5ab0b9fb7b 100644 --- a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp +++ b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp @@ -56,8 +56,10 @@ RegisterBankInfo::RegisterBankInfo(RegisterBank **RegBanks, unsigned NumRegBanks) : RegBanks(RegBanks), NumRegBanks(NumRegBanks) { #ifndef NDEBUG - for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) + for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) { assert(RegBanks[Idx] != nullptr && "Invalid RegisterBank"); + assert(RegBanks[Idx]->isValid() && "RegisterBank should be valid"); + } #endif // NDEBUG } @@ -74,116 +76,13 @@ bool RegisterBankInfo::verify(const TargetRegisterInfo &TRI) const { const RegisterBank &RegBank = getRegBank(Idx); assert(Idx == RegBank.getID() && "ID does not match the index in the array"); - dbgs() << "Verify " << RegBank << '\n'; + DEBUG(dbgs() << "Verify " << RegBank << '\n'); assert(RegBank.verify(TRI) && "RegBank is invalid"); } #endif // NDEBUG return true; } -void RegisterBankInfo::createRegisterBank(unsigned ID, const char *Name) { - DEBUG(dbgs() << "Create register bank: " << ID << 
" with name \"" << Name - << "\"\n"); - RegisterBank &RegBank = getRegBank(ID); - assert(RegBank.getID() == RegisterBank::InvalidID && - "A register bank should be created only once"); - RegBank.ID = ID; - RegBank.Name = Name; -} - -void RegisterBankInfo::addRegBankCoverage(unsigned ID, unsigned RCId, - const TargetRegisterInfo &TRI) { - RegisterBank &RB = getRegBank(ID); - unsigned NbOfRegClasses = TRI.getNumRegClasses(); - - DEBUG(dbgs() << "Add coverage for: " << RB << '\n'); - - // Check if RB is underconstruction. - if (!RB.isValid()) - RB.ContainedRegClasses.resize(NbOfRegClasses); - else if (RB.covers(*TRI.getRegClass(RCId))) - // If RB already covers this register class, there is nothing - // to do. - return; - - BitVector &Covered = RB.ContainedRegClasses; - SmallVector<unsigned, 8> WorkList; - - WorkList.push_back(RCId); - Covered.set(RCId); - - unsigned &MaxSize = RB.Size; - do { - unsigned RCId = WorkList.pop_back_val(); - - const TargetRegisterClass &CurRC = *TRI.getRegClass(RCId); - - DEBUG(dbgs() << "Examine: " << TRI.getRegClassName(&CurRC) - << "(Size*8: " << (CurRC.getSize() * 8) << ")\n"); - - // Remember the biggest size in bits. - MaxSize = std::max(MaxSize, CurRC.getSize() * 8); - - // Walk through all sub register classes and push them into the worklist. - bool First = true; - for (BitMaskClassIterator It(CurRC.getSubClassMask(), TRI); It.isValid(); - ++It) { - unsigned SubRCId = It.getID(); - if (!Covered.test(SubRCId)) { - if (First) - DEBUG(dbgs() << " Enqueue sub-class: "); - DEBUG(dbgs() << TRI.getRegClassName(TRI.getRegClass(SubRCId)) << ", "); - WorkList.push_back(SubRCId); - // Remember that we saw the sub class. - Covered.set(SubRCId); - First = false; - } - } - if (!First) - DEBUG(dbgs() << '\n'); - - // Push also all the register classes that can be accessed via a - // subreg index, i.e., its subreg-class (which is different than - // its subclass). - // - // Note: It would probably be faster to go the other way around - // and have this method add only super classes, since this - // information is available in a more efficient way. However, it - // feels less natural for the client of this APIs plus we will - // TableGen the whole bitset at some point, so compile time for - // the initialization is not very important. - First = true; - for (unsigned SubRCId = 0; SubRCId < NbOfRegClasses; ++SubRCId) { - if (Covered.test(SubRCId)) - continue; - bool Pushed = false; - const TargetRegisterClass *SubRC = TRI.getRegClass(SubRCId); - for (SuperRegClassIterator SuperRCIt(SubRC, &TRI); SuperRCIt.isValid(); - ++SuperRCIt) { - if (Pushed) - break; - for (BitMaskClassIterator It(SuperRCIt.getMask(), TRI); It.isValid(); - ++It) { - unsigned SuperRCId = It.getID(); - if (SuperRCId == RCId) { - if (First) - DEBUG(dbgs() << " Enqueue subreg-class: "); - DEBUG(dbgs() << TRI.getRegClassName(SubRC) << ", "); - WorkList.push_back(SubRCId); - // Remember that we saw the sub class. 
- Covered.set(SubRCId); - Pushed = true; - First = false; - break; - } - } - } - } - if (!First) - DEBUG(dbgs() << '\n'); - } while (!WorkList.empty()); -} - const RegisterBank * RegisterBankInfo::getRegBank(unsigned Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) const { diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp index d2ce001103df..2f2e3b3d8e9f 100644 --- a/lib/CodeGen/MachineInstr.cpp +++ b/lib/CodeGen/MachineInstr.cpp @@ -1840,7 +1840,8 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << "!\"" << DIV->getName() << '\"'; else MO.print(OS, MST, TRI); - } else if (TRI && (isInsertSubreg() || isRegSequence()) && MO.isImm()) { + } else if (TRI && (isInsertSubreg() || isRegSequence() || + (isSubregToReg() && i == 3)) && MO.isImm()) { OS << TRI->getSubRegIndexName(MO.getImm()); } else if (i == AsmDescOp && MO.isImm()) { // Pretty print the inline asm operand descriptor. diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp index 11af50fe577c..6d643457e9a9 100644 --- a/lib/CodeGen/PeepholeOptimizer.cpp +++ b/lib/CodeGen/PeepholeOptimizer.cpp @@ -1715,7 +1715,8 @@ ValueTrackerResult ValueTracker::getNextSourceFromBitcast() { // Bitcasts with more than one def are not supported. if (Def->getDesc().getNumDefs() != 1) return ValueTrackerResult(); - if (Def->getOperand(DefIdx).getSubReg() != DefSubReg) + const MachineOperand DefOp = Def->getOperand(DefIdx); + if (DefOp.getSubReg() != DefSubReg) // If we look for a different subreg, it means we want a subreg of the src. // Bails as we do not support composing subregs yet. return ValueTrackerResult(); @@ -1735,6 +1736,14 @@ ValueTrackerResult ValueTracker::getNextSourceFromBitcast() { return ValueTrackerResult(); SrcIdx = OpIdx; } + + // Stop when any user of the bitcast is a SUBREG_TO_REG, replacing with a COPY + // will break the assumed guarantees for the upper bits. + for (const MachineInstr &UseMI : MRI.use_nodbg_instructions(DefOp.getReg())) { + if (UseMI.isSubregToReg()) + return ValueTrackerResult(); + } + const MachineOperand &Src = Def->getOperand(SrcIdx); return ValueTrackerResult(Src.getReg(), Src.getSubReg()); } diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp index 1f0c3283ceb1..427d95268c74 100644 --- a/lib/CodeGen/ScheduleDAG.cpp +++ b/lib/CodeGen/ScheduleDAG.cpp @@ -310,19 +310,19 @@ void SUnit::biasCriticalPath() { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -static void dumpSUIdentifier(const ScheduleDAG &DAG, const SUnit &SU) { - if (&SU == &DAG.ExitSU) - dbgs() << "ExitSU"; - else if (&SU == &DAG.EntrySU) - dbgs() << "EntrySU"; +void SUnit::print(raw_ostream &OS, const ScheduleDAG *DAG) const { + if (this == &DAG->ExitSU) + OS << "ExitSU"; + else if (this == &DAG->EntrySU) + OS << "EntrySU"; else - dbgs() << "SU(" << SU.NodeNum << ")"; + OS << "SU(" << NodeNum << ")"; } /// SUnit - Scheduling unit. It's an wrapper around either a single SDNode or /// a group of nodes flagged together. 
void SUnit::dump(const ScheduleDAG *G) const { - dumpSUIdentifier(*G, *this); + print(dbgs(), G); dbgs() << ": "; G->dumpNode(this); } @@ -352,7 +352,7 @@ void SUnit::dumpAll(const ScheduleDAG *G) const { case SDep::Output: dbgs() << "out "; break; case SDep::Order: dbgs() << "ord "; break; } - dumpSUIdentifier(*G, *I->getSUnit()); + I->getSUnit()->print(dbgs(), G); if (I->isArtificial()) dbgs() << " *"; dbgs() << ": Latency=" << I->getLatency(); @@ -372,7 +372,7 @@ void SUnit::dumpAll(const ScheduleDAG *G) const { case SDep::Output: dbgs() << "out "; break; case SDep::Order: dbgs() << "ord "; break; } - dumpSUIdentifier(*G, *I->getSUnit()); + I->getSUnit()->print(dbgs(), G); if (I->isArtificial()) dbgs() << " *"; dbgs() << ": Latency=" << I->getLatency(); diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 4632484055d2..680f62fa91bc 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -5361,8 +5361,9 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { // fold (select false, X, Y) -> Y return !N0C->isNullValue() ? N1 : N2; } - // fold (select C, 1, X) -> (or C, X) - if (VT == MVT::i1 && isOneConstant(N1)) + // fold (select X, X, Y) -> (or X, Y) + // fold (select X, 1, Y) -> (or C, Y) + if (VT == VT0 && VT == MVT::i1 && (N0 == N1 || isOneConstant(N1))) return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N2); if (SDValue V = foldSelectOfConstants(N)) @@ -5380,16 +5381,9 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { AddToWorklist(NOTNode.getNode()); return DAG.getNode(ISD::OR, SDLoc(N), VT, NOTNode, N1); } - // fold (select C, X, 0) -> (and C, X) - if (VT == MVT::i1 && isNullConstant(N2)) - return DAG.getNode(ISD::AND, SDLoc(N), VT, N0, N1); - // fold (select X, X, Y) -> (or X, Y) - // fold (select X, 1, Y) -> (or X, Y) - if (VT == MVT::i1 && (N0 == N1 || isOneConstant(N1))) - return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N2); // fold (select X, Y, X) -> (and X, Y) // fold (select X, Y, 0) -> (and X, Y) - if (VT == MVT::i1 && (N0 == N2 || isNullConstant(N2))) + if (VT == VT0 && VT == MVT::i1 && (N0 == N2 || isNullConstant(N2))) return DAG.getNode(ISD::AND, SDLoc(N), VT, N0, N1); // If we can fold this based on the true/false value, do so. 
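[Editor's sketch, not part of the patch: the visitSELECT hunks above restrict the i1 select-to-logic folds with an extra "VT == VT0" guard, but the underlying boolean identities they encode are unchanged. The snippet below checks those identities exhaustively on plain bools; select1 is a hypothetical stand-in for an i1 ISD::SELECT node.]

#include <cassert>

static bool select1(bool C, bool T, bool F) { return C ? T : F; }

int main() {
  for (bool C : {false, true})
    for (bool Y : {false, true}) {
      assert(select1(C, true, Y) == (C || Y));  // (select C, 1, Y) -> (or C, Y)
      assert(select1(C, Y, false) == (C && Y)); // (select C, Y, 0) -> (and C, Y)
      assert(select1(C, C, Y) == (C || Y));     // (select X, X, Y) -> (or X, Y)
      assert(select1(C, Y, C) == (C && Y));     // (select X, Y, X) -> (and X, Y)
    }
  return 0;
}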
@@ -5470,7 +5464,6 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { } // select (xor Cond, 1), X, Y -> select Cond, Y, X - // select (xor Cond, 0), X, Y -> selext Cond, X, Y if (VT0 == MVT::i1) { if (N0->getOpcode() == ISD::XOR) { if (auto *C = dyn_cast<ConstantSDNode>(N0->getOperand(1))) { @@ -5478,9 +5471,6 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { if (C->isOne()) return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Cond0, N2, N1); - else - return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), - Cond0, N1, N2); } } } @@ -8136,7 +8126,8 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { if ((AllowFusion || HasFMAD) && Aggressive) { // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z)) if (N0.getOpcode() == PreferredFusedOpcode && - N0.getOperand(2).getOpcode() == ISD::FMUL) { + N0.getOperand(2).getOpcode() == ISD::FMUL && + N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8147,7 +8138,8 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { // fold (fadd x, (fma y, z, (fmul u, v)) -> (fma y, z (fma u, v, x)) if (N1->getOpcode() == PreferredFusedOpcode && - N1.getOperand(2).getOpcode() == ISD::FMUL) { + N1.getOperand(2).getOpcode() == ISD::FMUL && + N1->hasOneUse() && N1.getOperand(2)->hasOneUse()) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(0), N1.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -8379,7 +8371,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // fold (fsub (fma x, y, (fmul u, v)), z) // -> (fma x, y (fma u, v, (fneg z))) if (N0.getOpcode() == PreferredFusedOpcode && - N0.getOperand(2).getOpcode() == ISD::FMUL) { + N0.getOperand(2).getOpcode() == ISD::FMUL && + N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 3485e35e6f5d..b0028252836a 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -330,8 +330,6 @@ SDValue SelectionDAGLegalize::PerformInsertVectorEltInMemory(SDValue Vec, // supported by the target. EVT VT = Tmp1.getValueType(); EVT EltVT = VT.getVectorElementType(); - EVT IdxVT = Tmp3.getValueType(); - EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); SDValue StackPtr = DAG.CreateStackTemporary(VT); int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); @@ -341,13 +339,8 @@ SDValue SelectionDAGLegalize::PerformInsertVectorEltInMemory(SDValue Vec, DAG.getEntryNode(), dl, Tmp1, StackPtr, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI)); - // Truncate or zero extend offset to target pointer type. - Tmp3 = DAG.getZExtOrTrunc(Tmp3, dl, PtrVT); - // Add the offset to the index. - unsigned EltSize = EltVT.getSizeInBits()/8; - Tmp3 = DAG.getNode(ISD::MUL, dl, IdxVT, Tmp3, - DAG.getConstant(EltSize, dl, IdxVT)); - SDValue StackPtr2 = DAG.getNode(ISD::ADD, dl, IdxVT, Tmp3, StackPtr); + SDValue StackPtr2 = TLI.getVectorElementPointer(DAG, StackPtr, VT, Tmp3); + // Store the scalar value. Ch = DAG.getTruncStore(Ch, dl, Tmp2, StackPtr2, MachinePointerInfo(), EltVT); // Load the updated vector. 
@@ -1209,20 +1202,16 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) { } } + EVT VecVT = Vec.getValueType(); + if (!Ch.getNode()) { // Store the value to a temporary stack slot, then LOAD the returned part. - StackPtr = DAG.CreateStackTemporary(Vec.getValueType()); + StackPtr = DAG.CreateStackTemporary(VecVT); Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo()); } - // Add the offset to the index. - unsigned EltSize = Vec.getScalarValueSizeInBits() / 8; - Idx = DAG.getNode(ISD::MUL, dl, Idx.getValueType(), Idx, - DAG.getConstant(EltSize, SDLoc(Vec), Idx.getValueType())); - - Idx = DAG.getZExtOrTrunc(Idx, dl, TLI.getPointerTy(DAG.getDataLayout())); - StackPtr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, StackPtr); + StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); SDValue NewLoad; @@ -1232,7 +1221,7 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) { else NewLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, Op.getValueType(), Ch, StackPtr, MachinePointerInfo(), - Vec.getValueType().getVectorElementType()); + VecVT.getVectorElementType()); // Replace the chain going out of the store, by the one out of the load. DAG.ReplaceAllUsesOfValueWith(Ch, SDValue(NewLoad.getNode(), 1)); @@ -1256,8 +1245,8 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) { SDLoc dl(Op); // Store the value to a temporary stack slot, then LOAD the returned part. - - SDValue StackPtr = DAG.CreateStackTemporary(Vec.getValueType()); + EVT VecVT = Vec.getValueType(); + SDValue StackPtr = DAG.CreateStackTemporary(VecVT); int FI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); @@ -1266,16 +1255,7 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) { SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo); // Then store the inserted part. - - // Add the offset to the index. - unsigned EltSize = Vec.getScalarValueSizeInBits() / 8; - - Idx = DAG.getNode(ISD::MUL, dl, Idx.getValueType(), Idx, - DAG.getConstant(EltSize, SDLoc(Vec), Idx.getValueType())); - Idx = DAG.getZExtOrTrunc(Idx, dl, TLI.getPointerTy(DAG.getDataLayout())); - - SDValue SubStackPtr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, - StackPtr); + SDValue SubStackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); // Store the subvector. 
Ch = DAG.getStore(Ch, dl, Part, SubStackPtr, MachinePointerInfo()); diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 6b62f11f1240..dc436ce04514 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -57,8 +57,6 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::BSWAP: Res = PromoteIntRes_BSWAP(N); break; case ISD::BUILD_PAIR: Res = PromoteIntRes_BUILD_PAIR(N); break; case ISD::Constant: Res = PromoteIntRes_Constant(N); break; - case ISD::CONVERT_RNDSAT: - Res = PromoteIntRes_CONVERT_RNDSAT(N); break; case ISD::CTLZ_ZERO_UNDEF: case ISD::CTLZ: Res = PromoteIntRes_CTLZ(N); break; case ISD::CTPOP: Res = PromoteIntRes_CTPOP(N); break; @@ -354,18 +352,6 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Constant(SDNode *N) { return Result; } -SDValue DAGTypeLegalizer::PromoteIntRes_CONVERT_RNDSAT(SDNode *N) { - ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode(); - assert ((CvtCode == ISD::CVT_SS || CvtCode == ISD::CVT_SU || - CvtCode == ISD::CVT_US || CvtCode == ISD::CVT_UU || - CvtCode == ISD::CVT_SF || CvtCode == ISD::CVT_UF) && - "can only promote integers"); - EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - return DAG.getConvertRndSat(OutVT, SDLoc(N), N->getOperand(0), - N->getOperand(1), N->getOperand(2), - N->getOperand(3), N->getOperand(4), CvtCode); -} - SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) { // Zero extend to the promoted type and do the count there. SDValue Op = ZExtPromotedInteger(N->getOperand(0)); @@ -512,7 +498,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) { N->getIndex()}; SDValue Res = DAG.getMaskedGather(DAG.getVTList(NVT, MVT::Other), N->getMemoryVT(), dl, Ops, - N->getMemOperand()); + N->getMemOperand()); // Legalize the chain result - switch anything that used the old chain to // use the new one. 
ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); @@ -887,8 +873,6 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::BUILD_VECTOR: Res = PromoteIntOp_BUILD_VECTOR(N); break; case ISD::CONCAT_VECTORS: Res = PromoteIntOp_CONCAT_VECTORS(N); break; case ISD::EXTRACT_VECTOR_ELT: Res = PromoteIntOp_EXTRACT_VECTOR_ELT(N); break; - case ISD::CONVERT_RNDSAT: - Res = PromoteIntOp_CONVERT_RNDSAT(N); break; case ISD::INSERT_VECTOR_ELT: Res = PromoteIntOp_INSERT_VECTOR_ELT(N, OpNo);break; case ISD::SCALAR_TO_VECTOR: @@ -1068,18 +1052,6 @@ SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_VECTOR(SDNode *N) { return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } -SDValue DAGTypeLegalizer::PromoteIntOp_CONVERT_RNDSAT(SDNode *N) { - ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode(); - assert ((CvtCode == ISD::CVT_SS || CvtCode == ISD::CVT_SU || - CvtCode == ISD::CVT_US || CvtCode == ISD::CVT_UU || - CvtCode == ISD::CVT_FS || CvtCode == ISD::CVT_FU) && - "can only promote integer arguments"); - SDValue InOp = GetPromotedInteger(N->getOperand(0)); - return DAG.getConvertRndSat(N->getValueType(0), SDLoc(N), InOp, - N->getOperand(1), N->getOperand(2), - N->getOperand(3), N->getOperand(4), CvtCode); -} - SDValue DAGTypeLegalizer::PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N, unsigned OpNo) { if (OpNo == 1) { diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index 693f5e2120a7..cf19d75676cd 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -1021,22 +1021,6 @@ void DAGTypeLegalizer::GetPairElements(SDValue Pair, DAG.getIntPtrConstant(1, dl)); } -SDValue DAGTypeLegalizer::GetVectorElementPointer(SDValue VecPtr, EVT EltVT, - SDValue Index) { - SDLoc dl(Index); - // Make sure the index type is big enough to compute in. - Index = DAG.getZExtOrTrunc(Index, dl, TLI.getPointerTy(DAG.getDataLayout())); - - // Calculate the element offset and add it to the pointer. - unsigned EltSize = EltVT.getSizeInBits() / 8; // FIXME: should be ABI size. - assert(EltSize * 8 == EltVT.getSizeInBits() && - "Converting bits to bytes lost precision"); - - Index = DAG.getNode(ISD::MUL, dl, Index.getValueType(), Index, - DAG.getConstant(EltSize, dl, Index.getValueType())); - return DAG.getNode(ISD::ADD, dl, Index.getValueType(), Index, VecPtr); -} - /// Build an integer with low bits Lo and high bits Hi. SDValue DAGTypeLegalizer::JoinIntegers(SDValue Lo, SDValue Hi) { // Arbitrarily use dlHi for result SDLoc diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index d1022af69477..ec55662d75c0 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -173,7 +173,6 @@ private: /// input operand is returned. 
SDValue DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo); - SDValue GetVectorElementPointer(SDValue VecPtr, EVT EltVT, SDValue Index); SDValue JoinIntegers(SDValue Lo, SDValue Hi); SDValue LibCallify(RTLIB::Libcall LC, SDNode *N, bool isSigned); @@ -250,7 +249,6 @@ private: SDValue PromoteIntRes_BITREVERSE(SDNode *N); SDValue PromoteIntRes_BUILD_PAIR(SDNode *N); SDValue PromoteIntRes_Constant(SDNode *N); - SDValue PromoteIntRes_CONVERT_RNDSAT(SDNode *N); SDValue PromoteIntRes_CTLZ(SDNode *N); SDValue PromoteIntRes_CTPOP(SDNode *N); SDValue PromoteIntRes_CTTZ(SDNode *N); @@ -289,7 +287,6 @@ private: SDValue PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_BRCOND(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_BUILD_VECTOR(SDNode *N); - SDValue PromoteIntOp_CONVERT_RNDSAT(SDNode *N); SDValue PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N); SDValue PromoteIntOp_EXTRACT_SUBVECTOR(SDNode *N); @@ -603,7 +600,6 @@ private: SDValue ScalarizeVecRes_BITCAST(SDNode *N); SDValue ScalarizeVecRes_BUILD_VECTOR(SDNode *N); - SDValue ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N); SDValue ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N); SDValue ScalarizeVecRes_FP_ROUND(SDNode *N); SDValue ScalarizeVecRes_FPOWI(SDNode *N); @@ -709,7 +705,6 @@ private: SDValue WidenVecRes_BITCAST(SDNode* N); SDValue WidenVecRes_BUILD_VECTOR(SDNode* N); SDValue WidenVecRes_CONCAT_VECTORS(SDNode* N); - SDValue WidenVecRes_CONVERT_RNDSAT(SDNode* N); SDValue WidenVecRes_EXTEND_VECTOR_INREG(SDNode* N); SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N); SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N); diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 57c179ac15b8..27a9ac337f25 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -51,7 +51,6 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::MERGE_VALUES: R = ScalarizeVecRes_MERGE_VALUES(N, ResNo);break; case ISD::BITCAST: R = ScalarizeVecRes_BITCAST(N); break; case ISD::BUILD_VECTOR: R = ScalarizeVecRes_BUILD_VECTOR(N); break; - case ISD::CONVERT_RNDSAT: R = ScalarizeVecRes_CONVERT_RNDSAT(N); break; case ISD::EXTRACT_SUBVECTOR: R = ScalarizeVecRes_EXTRACT_SUBVECTOR(N); break; case ISD::FP_ROUND: R = ScalarizeVecRes_FP_ROUND(N); break; case ISD::FP_ROUND_INREG: R = ScalarizeVecRes_InregOp(N); break; @@ -179,17 +178,6 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_BUILD_VECTOR(SDNode *N) { return InOp; } -SDValue DAGTypeLegalizer::ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N) { - EVT NewVT = N->getValueType(0).getVectorElementType(); - SDValue Op0 = GetScalarizedVector(N->getOperand(0)); - return DAG.getConvertRndSat(NewVT, SDLoc(N), - Op0, DAG.getValueType(NewVT), - DAG.getValueType(Op0.getValueType()), - N->getOperand(3), - N->getOperand(4), - cast<CvtRndSatSDNode>(N)->getCvtCode()); -} - SDValue DAGTypeLegalizer::ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N) { return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), N->getValueType(0).getVectorElementType(), @@ -621,7 +609,6 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::BITREVERSE: case ISD::BSWAP: - case ISD::CONVERT_RNDSAT: case ISD::CTLZ: case ISD::CTTZ: case ISD::CTLZ_ZERO_UNDEF: @@ -846,7 +833,6 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo, GetSplitVector(Vec, Lo, Hi); EVT VecVT = Vec.getValueType(); - EVT VecElemVT = 
VecVT.getVectorElementType(); unsigned VecElems = VecVT.getVectorNumElements(); unsigned SubElems = SubVec.getValueType().getVectorNumElements(); @@ -872,7 +858,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo, DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo()); // Store the new subvector into the specified index. - SDValue SubVecPtr = GetVectorElementPointer(StackPtr, VecElemVT, Idx); + SDValue SubVecPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); Type *VecType = VecVT.getTypeForEVT(*DAG.getContext()); unsigned Alignment = DAG.getDataLayout().getPrefTypeAlignment(VecType); Store = DAG.getStore(Store, dl, SubVec, SubVecPtr, MachinePointerInfo()); @@ -1003,7 +989,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, // Store the new element. This may be larger than the vector element type, // so use a truncating store. - SDValue EltPtr = GetVectorElementPointer(StackPtr, EltVT, Idx); + SDValue EltPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); Type *VecType = VecVT.getTypeForEVT(*DAG.getContext()); unsigned Alignment = DAG.getDataLayout().getPrefTypeAlignment(VecType); Store = @@ -1236,18 +1222,6 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, if (N->getOpcode() == ISD::FP_ROUND) { Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo, N->getOperand(1)); Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi, N->getOperand(1)); - } else if (N->getOpcode() == ISD::CONVERT_RNDSAT) { - SDValue DTyOpLo = DAG.getValueType(LoVT); - SDValue DTyOpHi = DAG.getValueType(HiVT); - SDValue STyOpLo = DAG.getValueType(Lo.getValueType()); - SDValue STyOpHi = DAG.getValueType(Hi.getValueType()); - SDValue RndOp = N->getOperand(3); - SDValue SatOp = N->getOperand(4); - ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode(); - Lo = DAG.getConvertRndSat(LoVT, dl, Lo, DTyOpLo, STyOpLo, RndOp, SatOp, - CvtCode); - Hi = DAG.getConvertRndSat(HiVT, dl, Hi, DTyOpHi, STyOpHi, RndOp, SatOp, - CvtCode); } else { Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo); Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi); @@ -1650,7 +1624,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo()); // Load back the required element. 
- StackPtr = GetVectorElementPointer(StackPtr, EltVT, Idx); + StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx); return DAG.getExtLoad(ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr, MachinePointerInfo(), EltVT); } @@ -2045,7 +2019,6 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::BITCAST: Res = WidenVecRes_BITCAST(N); break; case ISD::BUILD_VECTOR: Res = WidenVecRes_BUILD_VECTOR(N); break; case ISD::CONCAT_VECTORS: Res = WidenVecRes_CONCAT_VECTORS(N); break; - case ISD::CONVERT_RNDSAT: Res = WidenVecRes_CONVERT_RNDSAT(N); break; case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break; case ISD::FP_ROUND_INREG: Res = WidenVecRes_InregOp(N); break; case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break; @@ -2693,86 +2666,6 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops); } -SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) { - SDLoc dl(N); - SDValue InOp = N->getOperand(0); - SDValue RndOp = N->getOperand(3); - SDValue SatOp = N->getOperand(4); - - EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - unsigned WidenNumElts = WidenVT.getVectorNumElements(); - - EVT InVT = InOp.getValueType(); - EVT InEltVT = InVT.getVectorElementType(); - EVT InWidenVT = EVT::getVectorVT(*DAG.getContext(), InEltVT, WidenNumElts); - - SDValue DTyOp = DAG.getValueType(WidenVT); - SDValue STyOp = DAG.getValueType(InWidenVT); - ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode(); - - unsigned InVTNumElts = InVT.getVectorNumElements(); - if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) { - InOp = GetWidenedVector(InOp); - InVT = InOp.getValueType(); - InVTNumElts = InVT.getVectorNumElements(); - if (InVTNumElts == WidenNumElts) - return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp, - SatOp, CvtCode); - } - - if (TLI.isTypeLegal(InWidenVT)) { - // Because the result and the input are different vector types, widening - // the result could create a legal type but widening the input might make - // it an illegal type that might lead to repeatedly splitting the input - // and then widening it. To avoid this, we widen the input only if - // it results in a legal type. - if (WidenNumElts % InVTNumElts == 0) { - // Widen the input and call convert on the widened input vector. - unsigned NumConcat = WidenNumElts/InVTNumElts; - SmallVector<SDValue, 16> Ops(NumConcat); - Ops[0] = InOp; - SDValue UndefVal = DAG.getUNDEF(InVT); - for (unsigned i = 1; i != NumConcat; ++i) - Ops[i] = UndefVal; - - InOp = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWidenVT, Ops); - return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp, - SatOp, CvtCode); - } - - if (InVTNumElts % WidenNumElts == 0) { - // Extract the input and convert the shorten input vector. - InOp = DAG.getNode( - ISD::EXTRACT_SUBVECTOR, dl, InWidenVT, InOp, - DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); - return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp, - SatOp, CvtCode); - } - } - - // Otherwise unroll into some nasty scalar code and rebuild the vector. 
- SmallVector<SDValue, 16> Ops(WidenNumElts); - EVT EltVT = WidenVT.getVectorElementType(); - DTyOp = DAG.getValueType(EltVT); - STyOp = DAG.getValueType(InEltVT); - - unsigned MinElts = std::min(InVTNumElts, WidenNumElts); - unsigned i; - for (i=0; i < MinElts; ++i) { - SDValue ExtVal = DAG.getNode( - ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp, - DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); - Ops[i] = DAG.getConvertRndSat(WidenVT, dl, ExtVal, DTyOp, STyOp, RndOp, - SatOp, CvtCode); - } - - SDValue UndefVal = DAG.getUNDEF(EltVT); - for (; i < WidenNumElts; ++i) - Ops[i] = UndefVal; - - return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops); -} - SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) { EVT VT = N->getValueType(0); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h index 5cc806668b12..a058942c5689 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h @@ -15,10 +15,20 @@ #ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_SCHEDULEDAGSDNODES_H #define LLVM_LIB_CODEGEN_SELECTIONDAG_SCHEDULEDAGSDNODES_H +#include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/Support/Casting.h" +#include <cassert> +#include <string> +#include <vector> namespace llvm { + +class InstrItineraryData; + /// ScheduleDAGSDNodes - A ScheduleDAG for scheduling SDNode-based DAGs. /// /// Edges between SUnits are initially based on edges in the SelectionDAG, @@ -44,7 +54,7 @@ namespace llvm { explicit ScheduleDAGSDNodes(MachineFunction &mf); - ~ScheduleDAGSDNodes() override {} + ~ScheduleDAGSDNodes() override = default; /// Run - perform scheduling. 
/// @@ -131,6 +141,7 @@ namespace llvm { unsigned DefIdx; unsigned NodeNumDefs; MVT ValueType; + public: RegDefIter(const SUnit *SU, const ScheduleDAGSDNodes *SD); @@ -150,6 +161,7 @@ namespace llvm { } void Advance(); + private: void InitNodeNumDefs(); }; @@ -175,6 +187,7 @@ namespace llvm { void EmitPhysRegCopy(SUnit *SU, DenseMap<SUnit*, unsigned> &VRBaseMap, MachineBasicBlock::iterator InsertPos); }; -} -#endif +} // end namespace llvm + +#endif // LLVM_LIB_CODEGEN_SELECTIONDAG_SCHEDULEDAGSDNODES_H diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index b970dc0e5f5f..e225ba8703b7 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1104,7 +1104,7 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, const SDLoc &DL, if (VT.isVector() && TLI->getTypeAction(*getContext(), EltVT) == TargetLowering::TypePromoteInteger) { EltVT = TLI->getTypeToTransformTo(*getContext(), EltVT); - APInt NewVal = Elt->getValue().zext(EltVT.getSizeInBits()); + APInt NewVal = Elt->getValue().zextOrTrunc(EltVT.getSizeInBits()); Elt = ConstantInt::get(*getContext(), NewVal); } // In other cases the element type is illegal and needs to be expanded, for @@ -1130,7 +1130,7 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, const SDLoc &DL, SmallVector<SDValue, 2> EltParts; for (unsigned i = 0; i < ViaVecNumElts / VT.getVectorNumElements(); ++i) { EltParts.push_back(getConstant(NewVal.lshr(i * ViaEltSizeInBits) - .trunc(ViaEltSizeInBits), DL, + .zextOrTrunc(ViaEltSizeInBits), DL, ViaEltVT, isT, isO)); } @@ -1629,31 +1629,6 @@ SDValue SelectionDAG::getCommutedVectorShuffle(const ShuffleVectorSDNode &SV) { return getVectorShuffle(VT, SDLoc(&SV), Op1, Op0, MaskVec); } -SDValue SelectionDAG::getConvertRndSat(EVT VT, const SDLoc &dl, SDValue Val, - SDValue DTy, SDValue STy, SDValue Rnd, - SDValue Sat, ISD::CvtCode Code) { - // If the src and dest types are the same and the conversion is between - // integer types of the same sign or two floats, no conversion is necessary. 
- if (DTy == STy && - (Code == ISD::CVT_UU || Code == ISD::CVT_SS || Code == ISD::CVT_FF)) - return Val; - - FoldingSetNodeID ID; - SDValue Ops[] = { Val, DTy, STy, Rnd, Sat }; - AddNodeIDNode(ID, ISD::CONVERT_RNDSAT, getVTList(VT), Ops); - void* IP = nullptr; - if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) - return SDValue(E, 0); - - auto *N = - newSDNode<CvtRndSatSDNode>(VT, dl.getIROrder(), dl.getDebugLoc(), Code); - createOperands(N, Ops); - - CSEMap.InsertNode(N, IP); - InsertNode(N); - return SDValue(N, 0); -} - SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) { FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::Register, getVTList(VT), None); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index a07bd8f83546..9ca646534e2b 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5211,39 +5211,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { setValue(&I, Res); return nullptr; } - case Intrinsic::convertff: - case Intrinsic::convertfsi: - case Intrinsic::convertfui: - case Intrinsic::convertsif: - case Intrinsic::convertuif: - case Intrinsic::convertss: - case Intrinsic::convertsu: - case Intrinsic::convertus: - case Intrinsic::convertuu: { - ISD::CvtCode Code = ISD::CVT_INVALID; - switch (Intrinsic) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::convertff: Code = ISD::CVT_FF; break; - case Intrinsic::convertfsi: Code = ISD::CVT_FS; break; - case Intrinsic::convertfui: Code = ISD::CVT_FU; break; - case Intrinsic::convertsif: Code = ISD::CVT_SF; break; - case Intrinsic::convertuif: Code = ISD::CVT_UF; break; - case Intrinsic::convertss: Code = ISD::CVT_SS; break; - case Intrinsic::convertsu: Code = ISD::CVT_SU; break; - case Intrinsic::convertus: Code = ISD::CVT_US; break; - case Intrinsic::convertuu: Code = ISD::CVT_UU; break; - } - EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); - const Value *Op1 = I.getArgOperand(0); - Res = DAG.getConvertRndSat(DestVT, sdl, getValue(Op1), - DAG.getValueType(DestVT), - DAG.getValueType(getValue(Op1).getValueType()), - getValue(I.getArgOperand(1)), - getValue(I.getArgOperand(2)), - Code); - setValue(&I, Res); - return nullptr; - } case Intrinsic::powi: setValue(&I, ExpandPowI(sdl, getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), DAG)); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 340088a5fc96..0faaad8a21b7 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -262,21 +262,6 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FP16_TO_FP: return "fp16_to_fp"; case ISD::FP_TO_FP16: return "fp_to_fp16"; - case ISD::CONVERT_RNDSAT: { - switch (cast<CvtRndSatSDNode>(this)->getCvtCode()) { - default: llvm_unreachable("Unknown cvt code!"); - case ISD::CVT_FF: return "cvt_ff"; - case ISD::CVT_FS: return "cvt_fs"; - case ISD::CVT_FU: return "cvt_fu"; - case ISD::CVT_SF: return "cvt_sf"; - case ISD::CVT_UF: return "cvt_uf"; - case ISD::CVT_SS: return "cvt_ss"; - case ISD::CVT_SU: return "cvt_su"; - case ISD::CVT_US: return "cvt_us"; - case ISD::CVT_UU: return "cvt_uu"; - } - } - // Control flow instructions case ISD::BR: return "br"; case ISD::BRIND: return "brind"; @@ -322,7 +307,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case 
ISD::CTTZ_ZERO_UNDEF: return "cttz_zero_undef"; case ISD::CTLZ: return "ctlz"; case ISD::CTLZ_ZERO_UNDEF: return "ctlz_zero_undef"; - + // Trampolines case ISD::INIT_TRAMPOLINE: return "init_trampoline"; case ISD::ADJUST_TRAMPOLINE: return "adjust_trampoline"; diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 591a37d600cc..690f0d2c8082 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -3706,7 +3706,7 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST, return Result; } -SDValue +SDValue TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask, const SDLoc &DL, EVT DataVT, SelectionDAG &DAG, @@ -3738,6 +3738,49 @@ TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask, return DAG.getNode(ISD::ADD, DL, AddrVT, Addr, Increment); } +static SDValue clampDynamicVectorIndex(SelectionDAG &DAG, + SDValue Idx, + EVT VecVT, + const SDLoc &dl) { + if (isa<ConstantSDNode>(Idx)) + return Idx; + + EVT IdxVT = Idx.getValueType(); + unsigned NElts = VecVT.getVectorNumElements(); + if (isPowerOf2_32(NElts)) { + APInt Imm = APInt::getLowBitsSet(IdxVT.getSizeInBits(), + Log2_32(NElts)); + return DAG.getNode(ISD::AND, dl, IdxVT, Idx, + DAG.getConstant(Imm, dl, IdxVT)); + } + + return DAG.getNode(ISD::UMIN, dl, IdxVT, Idx, + DAG.getConstant(NElts - 1, dl, IdxVT)); +} + +SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG, + SDValue VecPtr, EVT VecVT, + SDValue Index) const { + SDLoc dl(Index); + // Make sure the index type is big enough to compute in. + Index = DAG.getZExtOrTrunc(Index, dl, getPointerTy(DAG.getDataLayout())); + + EVT EltVT = VecVT.getVectorElementType(); + + // Calculate the element offset and add it to the pointer. + unsigned EltSize = EltVT.getSizeInBits() / 8; // FIXME: should be ABI size. + assert(EltSize * 8 == EltVT.getSizeInBits() && + "Converting bits to bytes lost precision"); + + Index = clampDynamicVectorIndex(DAG, Index, VecVT, dl); + + EVT IdxVT = Index.getValueType(); + + Index = DAG.getNode(ISD::MUL, dl, IdxVT, Index, + DAG.getConstant(EltSize, dl, IdxVT)); + return DAG.getNode(ISD::ADD, dl, IdxVT, Index, VecPtr); +} + //===----------------------------------------------------------------------===// // Implementation of Emulated TLS Model //===----------------------------------------------------------------------===// diff --git a/lib/DebugInfo/CodeView/CMakeLists.txt b/lib/DebugInfo/CodeView/CMakeLists.txt index 221a8969965d..f9bff86b41c8 100644 --- a/lib/DebugInfo/CodeView/CMakeLists.txt +++ b/lib/DebugInfo/CodeView/CMakeLists.txt @@ -2,6 +2,7 @@ add_llvm_library(LLVMDebugInfoCodeView CodeViewError.cpp CodeViewRecordIO.cpp CVSymbolVisitor.cpp + CVTypeDumper.cpp CVTypeVisitor.cpp EnumTables.cpp Line.cpp @@ -10,7 +11,9 @@ add_llvm_library(LLVMDebugInfoCodeView RecordSerialization.cpp SymbolRecordMapping.cpp SymbolDumper.cpp - TypeDumper.cpp + TypeDatabase.cpp + TypeDatabaseVisitor.cpp + TypeDumpVisitor.cpp TypeRecord.cpp TypeRecordMapping.cpp TypeSerializer.cpp diff --git a/lib/DebugInfo/CodeView/CVTypeDumper.cpp b/lib/DebugInfo/CodeView/CVTypeDumper.cpp new file mode 100644 index 000000000000..fcd239cce0dd --- /dev/null +++ b/lib/DebugInfo/CodeView/CVTypeDumper.cpp @@ -0,0 +1,73 @@ +//===-- CVTypeDumper.cpp - CodeView type info dumper ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/CodeView/CVTypeDumper.h" +#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" +#include "llvm/DebugInfo/CodeView/TypeDatabase.h" +#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h" +#include "llvm/DebugInfo/CodeView/TypeDeserializer.h" +#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h" +#include "llvm/DebugInfo/MSF/ByteStream.h" + +using namespace llvm; +using namespace llvm::codeview; + +Error CVTypeDumper::dump(const CVType &Record, TypeVisitorCallbacks &Dumper) { + TypeDatabaseVisitor DBV(TypeDB); + TypeDeserializer Deserializer; + TypeVisitorCallbackPipeline Pipeline; + Pipeline.addCallbackToPipeline(Deserializer); + Pipeline.addCallbackToPipeline(DBV); + Pipeline.addCallbackToPipeline(Dumper); + + CVTypeVisitor Visitor(Pipeline); + + CVType RecordCopy = Record; + if (auto EC = Visitor.visitTypeRecord(RecordCopy)) + return EC; + return Error::success(); +} + +Error CVTypeDumper::dump(const CVTypeArray &Types, + TypeVisitorCallbacks &Dumper) { + TypeDatabaseVisitor DBV(TypeDB); + TypeDeserializer Deserializer; + TypeVisitorCallbackPipeline Pipeline; + Pipeline.addCallbackToPipeline(Deserializer); + Pipeline.addCallbackToPipeline(DBV); + Pipeline.addCallbackToPipeline(Dumper); + + CVTypeVisitor Visitor(Pipeline); + + if (auto EC = Visitor.visitTypeStream(Types)) + return EC; + return Error::success(); +} + +Error CVTypeDumper::dump(ArrayRef<uint8_t> Data, TypeVisitorCallbacks &Dumper) { + msf::ByteStream Stream(Data); + CVTypeArray Types; + msf::StreamReader Reader(Stream); + if (auto EC = Reader.readArray(Types, Reader.getLength())) + return EC; + + return dump(Types, Dumper); +} + +void CVTypeDumper::printTypeIndex(ScopedPrinter &Printer, StringRef FieldName, + TypeIndex TI, TypeDatabase &DB) { + StringRef TypeName; + if (!TI.isNoneType()) + TypeName = DB.getTypeName(TI); + if (!TypeName.empty()) + Printer.printHex(FieldName, TypeName, TI.getIndex()); + else + Printer.printHex(FieldName, TI.getIndex()); +} diff --git a/lib/DebugInfo/CodeView/SymbolDumper.cpp b/lib/DebugInfo/CodeView/SymbolDumper.cpp index 326e1f5add65..fd54fba13c76 100644 --- a/lib/DebugInfo/CodeView/SymbolDumper.cpp +++ b/lib/DebugInfo/CodeView/SymbolDumper.cpp @@ -11,13 +11,13 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallString.h" #include "llvm/DebugInfo/CodeView/CVSymbolVisitor.h" +#include "llvm/DebugInfo/CodeView/CVTypeDumper.h" #include "llvm/DebugInfo/CodeView/EnumTables.h" #include "llvm/DebugInfo/CodeView/SymbolDeserializer.h" #include "llvm/DebugInfo/CodeView/SymbolDumpDelegate.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" #include "llvm/DebugInfo/CodeView/SymbolVisitorCallbackPipeline.h" #include "llvm/DebugInfo/CodeView/SymbolVisitorCallbacks.h" -#include "llvm/DebugInfo/CodeView/TypeDumper.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/Support/Error.h" #include "llvm/Support/ScopedPrinter.h" @@ -32,9 +32,9 @@ namespace { /// the visitor out of SymbolDumper.h. 
class CVSymbolDumperImpl : public SymbolVisitorCallbacks { public: - CVSymbolDumperImpl(CVTypeDumper &CVTD, SymbolDumpDelegate *ObjDelegate, + CVSymbolDumperImpl(TypeDatabase &TypeDB, SymbolDumpDelegate *ObjDelegate, ScopedPrinter &W, bool PrintRecordBytes) - : CVTD(CVTD), ObjDelegate(ObjDelegate), W(W), + : TypeDB(TypeDB), ObjDelegate(ObjDelegate), W(W), PrintRecordBytes(PrintRecordBytes), InFunctionScope(false) {} /// CVSymbolVisitor overrides. @@ -51,8 +51,9 @@ private: void printLocalVariableAddrRange(const LocalVariableAddrRange &Range, uint32_t RelocationOffset); void printLocalVariableAddrGap(ArrayRef<LocalVariableAddrGap> Gaps); + void printTypeIndex(StringRef FieldName, TypeIndex TI); - CVTypeDumper &CVTD; + TypeDatabase &TypeDB; SymbolDumpDelegate *ObjDelegate; ScopedPrinter &W; @@ -80,6 +81,10 @@ void CVSymbolDumperImpl::printLocalVariableAddrGap( } } +void CVSymbolDumperImpl::printTypeIndex(StringRef FieldName, TypeIndex TI) { + CVTypeDumper::printTypeIndex(W, FieldName, TI, TypeDB); +} + Error CVSymbolDumperImpl::visitSymbolBegin(CVSymbol &CVR) { return Error::success(); } @@ -163,7 +168,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, DictScope S(W, "BPRelativeSym"); W.printNumber("Offset", BPRel.Offset); - CVTD.printTypeIndex("Type", BPRel.Type); + printTypeIndex("Type", BPRel.Type); W.printString("VarName", BPRel.Name); return Error::success(); } @@ -187,7 +192,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, CallSiteInfo.CodeOffset, &LinkageName); } W.printHex("Segment", CallSiteInfo.Segment); - CVTD.printTypeIndex("Type", CallSiteInfo.Type); + printTypeIndex("Type", CallSiteInfo.Type); if (!LinkageName.empty()) W.printString("LinkageName", LinkageName); return Error::success(); @@ -278,7 +283,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, ConstantSym &Constant) { DictScope S(W, "Constant"); - CVTD.printTypeIndex("Type", Constant.Type); + printTypeIndex("Type", Constant.Type); W.printNumber("Value", Constant.Value); W.printString("Name", Constant.Name); return Error::success(); @@ -293,7 +298,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, DataSym &Data) { ObjDelegate->printRelocatedField("DataOffset", Data.getRelocationOffset(), Data.DataOffset, &LinkageName); } - CVTD.printTypeIndex("Type", Data.Type); + printTypeIndex("Type", Data.Type); W.printString("DisplayName", Data.Name); if (!LinkageName.empty()) W.printString("LinkageName", LinkageName); @@ -445,7 +450,7 @@ Error CVSymbolDumperImpl::visitKnownRecord( } W.printHex("Segment", HeapAllocSite.Segment); W.printHex("CallInstructionSize", HeapAllocSite.CallInstructionSize); - CVTD.printTypeIndex("Type", HeapAllocSite.Type); + printTypeIndex("Type", HeapAllocSite.Type); if (!LinkageName.empty()) W.printString("LinkageName", LinkageName); return Error::success(); @@ -457,7 +462,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, W.printHex("PtrParent", InlineSite.Parent); W.printHex("PtrEnd", InlineSite.End); - CVTD.printTypeIndex("Inlinee", InlineSite.Inlinee); + printTypeIndex("Inlinee", InlineSite.Inlinee); ListScope BinaryAnnotations(W, "BinaryAnnotations"); for (auto &Annotation : InlineSite.annotations()) { @@ -555,7 +560,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, LabelSym &Label) { Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, LocalSym &Local) { DictScope S(W, "Local"); - CVTD.printTypeIndex("Type", Local.Type); + printTypeIndex("Type", Local.Type); W.printFlags("Flags", uint16_t(Local.Flags), getLocalFlagNames()); 
W.printString("VarName", Local.Name); return Error::success(); @@ -586,7 +591,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, ProcSym &Proc) { W.printHex("CodeSize", Proc.CodeSize); W.printHex("DbgStart", Proc.DbgStart); W.printHex("DbgEnd", Proc.DbgEnd); - CVTD.printTypeIndex("FunctionType", Proc.FunctionType); + printTypeIndex("FunctionType", Proc.FunctionType); if (ObjDelegate) { ObjDelegate->printRelocatedField("CodeOffset", Proc.getRelocationOffset(), Proc.CodeOffset, &LinkageName); @@ -616,7 +621,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, CallerSym &Caller) { ListScope S(W, CVR.kind() == S_CALLEES ? "Callees" : "Callers"); for (auto FuncID : Caller.Indices) - CVTD.printTypeIndex("FuncID", FuncID); + printTypeIndex("FuncID", FuncID); return Error::success(); } @@ -625,7 +630,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, DictScope S(W, "RegRelativeSym"); W.printHex("Offset", RegRel.Offset); - CVTD.printTypeIndex("Type", RegRel.Type); + printTypeIndex("Type", RegRel.Type); W.printHex("Register", RegRel.Register); W.printString("VarName", RegRel.Name); return Error::success(); @@ -640,7 +645,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, ObjDelegate->printRelocatedField("DataOffset", Data.getRelocationOffset(), Data.DataOffset, &LinkageName); } - CVTD.printTypeIndex("Type", Data.Type); + printTypeIndex("Type", Data.Type); W.printString("DisplayName", Data.Name); if (!LinkageName.empty()) W.printString("LinkageName", LinkageName); @@ -649,7 +654,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, UDTSym &UDT) { DictScope S(W, "UDT"); - CVTD.printTypeIndex("Type", UDT.Type); + printTypeIndex("Type", UDT.Type); W.printString("UDTName", UDT.Name); return Error::success(); } @@ -664,7 +669,7 @@ Error CVSymbolDumperImpl::visitUnknownSymbol(CVSymbol &CVR) { Error CVSymbolDumper::dump(CVRecord<SymbolKind> &Record) { SymbolVisitorCallbackPipeline Pipeline; SymbolDeserializer Deserializer(ObjDelegate.get()); - CVSymbolDumperImpl Dumper(CVTD, ObjDelegate.get(), W, PrintRecordBytes); + CVSymbolDumperImpl Dumper(TypeDB, ObjDelegate.get(), W, PrintRecordBytes); Pipeline.addCallbackToPipeline(Deserializer); Pipeline.addCallbackToPipeline(Dumper); @@ -675,7 +680,7 @@ Error CVSymbolDumper::dump(CVRecord<SymbolKind> &Record) { Error CVSymbolDumper::dump(const CVSymbolArray &Symbols) { SymbolVisitorCallbackPipeline Pipeline; SymbolDeserializer Deserializer(ObjDelegate.get()); - CVSymbolDumperImpl Dumper(CVTD, ObjDelegate.get(), W, PrintRecordBytes); + CVSymbolDumperImpl Dumper(TypeDB, ObjDelegate.get(), W, PrintRecordBytes); Pipeline.addCallbackToPipeline(Deserializer); Pipeline.addCallbackToPipeline(Dumper); diff --git a/lib/DebugInfo/CodeView/TypeDatabase.cpp b/lib/DebugInfo/CodeView/TypeDatabase.cpp new file mode 100644 index 000000000000..c7f72551dc8b --- /dev/null +++ b/lib/DebugInfo/CodeView/TypeDatabase.cpp @@ -0,0 +1,114 @@ +//===- TypeDatabase.cpp --------------------------------------- *- C++ --*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/CodeView/TypeDatabase.h" + +using namespace llvm; +using namespace llvm::codeview; + +namespace { +struct SimpleTypeEntry { + StringRef Name; + SimpleTypeKind Kind; +}; +} + +/// The names here all end in "*". If the simple type is a pointer type, we +/// return the whole name. Otherwise we lop off the last character in our +/// StringRef. +static const SimpleTypeEntry SimpleTypeNames[] = { + {"void*", SimpleTypeKind::Void}, + {"<not translated>*", SimpleTypeKind::NotTranslated}, + {"HRESULT*", SimpleTypeKind::HResult}, + {"signed char*", SimpleTypeKind::SignedCharacter}, + {"unsigned char*", SimpleTypeKind::UnsignedCharacter}, + {"char*", SimpleTypeKind::NarrowCharacter}, + {"wchar_t*", SimpleTypeKind::WideCharacter}, + {"char16_t*", SimpleTypeKind::Character16}, + {"char32_t*", SimpleTypeKind::Character32}, + {"__int8*", SimpleTypeKind::SByte}, + {"unsigned __int8*", SimpleTypeKind::Byte}, + {"short*", SimpleTypeKind::Int16Short}, + {"unsigned short*", SimpleTypeKind::UInt16Short}, + {"__int16*", SimpleTypeKind::Int16}, + {"unsigned __int16*", SimpleTypeKind::UInt16}, + {"long*", SimpleTypeKind::Int32Long}, + {"unsigned long*", SimpleTypeKind::UInt32Long}, + {"int*", SimpleTypeKind::Int32}, + {"unsigned*", SimpleTypeKind::UInt32}, + {"__int64*", SimpleTypeKind::Int64Quad}, + {"unsigned __int64*", SimpleTypeKind::UInt64Quad}, + {"__int64*", SimpleTypeKind::Int64}, + {"unsigned __int64*", SimpleTypeKind::UInt64}, + {"__int128*", SimpleTypeKind::Int128}, + {"unsigned __int128*", SimpleTypeKind::UInt128}, + {"__half*", SimpleTypeKind::Float16}, + {"float*", SimpleTypeKind::Float32}, + {"float*", SimpleTypeKind::Float32PartialPrecision}, + {"__float48*", SimpleTypeKind::Float48}, + {"double*", SimpleTypeKind::Float64}, + {"long double*", SimpleTypeKind::Float80}, + {"__float128*", SimpleTypeKind::Float128}, + {"_Complex float*", SimpleTypeKind::Complex32}, + {"_Complex double*", SimpleTypeKind::Complex64}, + {"_Complex long double*", SimpleTypeKind::Complex80}, + {"_Complex __float128*", SimpleTypeKind::Complex128}, + {"bool*", SimpleTypeKind::Boolean8}, + {"__bool16*", SimpleTypeKind::Boolean16}, + {"__bool32*", SimpleTypeKind::Boolean32}, + {"__bool64*", SimpleTypeKind::Boolean64}, +}; + +/// Gets the type index for the next type record. +TypeIndex TypeDatabase::getNextTypeIndex() const { + return TypeIndex(TypeIndex::FirstNonSimpleIndex + CVUDTNames.size()); +} + +/// Records the name of a type, and reserves its type index. +void TypeDatabase::recordType(StringRef Name, CVType Data) { + CVUDTNames.push_back(Name); + TypeRecords.push_back(Data); +} + +/// Saves the name in a StringSet and creates a stable StringRef. +StringRef TypeDatabase::saveTypeName(StringRef TypeName) { + return TypeNameStorage.save(TypeName); +} + +StringRef TypeDatabase::getTypeName(TypeIndex Index) const { + if (Index.isNoneType()) + return "<no type>"; + + if (Index.isSimple()) { + // This is a simple type. + for (const auto &SimpleTypeName : SimpleTypeNames) { + if (SimpleTypeName.Kind == Index.getSimpleKind()) { + if (Index.getSimpleMode() == SimpleTypeMode::Direct) + return SimpleTypeName.Name.drop_back(1); + // Otherwise, this is a pointer type. We gloss over the distinction + // between near, far, 64, 32, etc, and just give a pointer type. 
+ return SimpleTypeName.Name; + } + } + return "<unknown simple type>"; + } + + uint32_t I = Index.getIndex() - TypeIndex::FirstNonSimpleIndex; + if (I < CVUDTNames.size()) + return CVUDTNames[I]; + + return "<unknown UDT>"; +} + +bool TypeDatabase::containsTypeIndex(TypeIndex Index) const { + uint32_t I = Index.getIndex() - TypeIndex::FirstNonSimpleIndex; + return I < CVUDTNames.size(); +} + +uint32_t TypeDatabase::size() const { return CVUDTNames.size(); } diff --git a/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp b/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp new file mode 100644 index 000000000000..d9d563902182 --- /dev/null +++ b/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp @@ -0,0 +1,289 @@ +//===- TypeDatabaseVisitor.cpp -------------------------------- *- C++ --*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h" + +#include "llvm/ADT/SmallString.h" + +using namespace llvm; + +using namespace llvm::codeview; + +Error TypeDatabaseVisitor::visitTypeBegin(CVRecord<TypeLeafKind> &Record) { + assert(!IsInFieldList); + // Reset Name to the empty string. If the visitor sets it, we know it. + Name = ""; + + if (Record.Type == LF_FIELDLIST) { + // Record that we're in a field list so that members do not get assigned + // type indices. + IsInFieldList = true; + } + return Error::success(); +} + +Error TypeDatabaseVisitor::visitTypeEnd(CVType &CVR) { + if (CVR.Type == LF_FIELDLIST) { + assert(IsInFieldList); + IsInFieldList = false; + } + assert(!IsInFieldList); + + // Record every type that is not a field list member, even if Name is empty. + // CVUDTNames is indexed by type index, and must have one entry for every + // type. Field list members are not recorded, and are only referenced by + // their containing field list record. + TypeDB.recordType(Name, CVR); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitMemberBegin(CVMemberRecord &Record) { + assert(IsInFieldList); + // Reset Name to the empty string. If the visitor sets it, we know it. + Name = ""; + return Error::success(); +} + +Error TypeDatabaseVisitor::visitMemberEnd(CVMemberRecord &Record) { + assert(IsInFieldList); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, + FieldListRecord &FieldList) { + Name = "<field list>"; + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, + StringIdRecord &String) { + // Put this in the database so it gets printed with LF_UDT_SRC_LINE. 
+ Name = String.getString(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ArgListRecord &Args) { + auto Indices = Args.getIndices(); + uint32_t Size = Indices.size(); + SmallString<256> TypeName("("); + for (uint32_t I = 0; I < Size; ++I) { + StringRef ArgTypeName = TypeDB.getTypeName(Indices[I]); + TypeName.append(ArgTypeName); + if (I + 1 != Size) + TypeName.append(", "); + } + TypeName.push_back(')'); + Name = TypeDB.saveTypeName(TypeName); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ClassRecord &Class) { + Name = Class.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, UnionRecord &Union) { + Name = Union.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, EnumRecord &Enum) { + Name = Enum.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ArrayRecord &AT) { + Name = AT.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, VFTableRecord &VFT) { + Name = VFT.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, + MemberFuncIdRecord &Id) { + Name = Id.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, + ProcedureRecord &Proc) { + StringRef ReturnTypeName = TypeDB.getTypeName(Proc.getReturnType()); + StringRef ArgListTypeName = TypeDB.getTypeName(Proc.getArgumentList()); + SmallString<256> TypeName(ReturnTypeName); + TypeName.push_back(' '); + TypeName.append(ArgListTypeName); + Name = TypeDB.saveTypeName(TypeName); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, + MemberFunctionRecord &MF) { + StringRef ReturnTypeName = TypeDB.getTypeName(MF.getReturnType()); + StringRef ClassTypeName = TypeDB.getTypeName(MF.getClassType()); + StringRef ArgListTypeName = TypeDB.getTypeName(MF.getArgumentList()); + SmallString<256> TypeName(ReturnTypeName); + TypeName.push_back(' '); + TypeName.append(ClassTypeName); + TypeName.append("::"); + TypeName.append(ArgListTypeName); + Name = TypeDB.saveTypeName(TypeName); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, FuncIdRecord &Func) { + Name = Func.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, + TypeServer2Record &TS) { + Name = TS.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) { + + if (Ptr.isPointerToMember()) { + const MemberPointerInfo &MI = Ptr.getMemberInfo(); + + StringRef PointeeName = TypeDB.getTypeName(Ptr.getReferentType()); + StringRef ClassName = TypeDB.getTypeName(MI.getContainingType()); + SmallString<256> TypeName(PointeeName); + TypeName.push_back(' '); + TypeName.append(ClassName); + TypeName.append("::*"); + Name = TypeDB.saveTypeName(TypeName); + } else { + SmallString<256> TypeName; + if (Ptr.isConst()) + TypeName.append("const "); + if (Ptr.isVolatile()) + TypeName.append("volatile "); + if (Ptr.isUnaligned()) + TypeName.append("__unaligned "); + + TypeName.append(TypeDB.getTypeName(Ptr.getReferentType())); + + if (Ptr.getMode() == PointerMode::LValueReference) + TypeName.append("&"); + else if (Ptr.getMode() == PointerMode::RValueReference) + TypeName.append("&&"); + else if (Ptr.getMode() == PointerMode::Pointer) + TypeName.append("*"); + + if 
(!TypeName.empty()) + Name = TypeDB.saveTypeName(TypeName); + } + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ModifierRecord &Mod) { + uint16_t Mods = static_cast<uint16_t>(Mod.getModifiers()); + + StringRef ModifiedName = TypeDB.getTypeName(Mod.getModifiedType()); + SmallString<256> TypeName; + if (Mods & uint16_t(ModifierOptions::Const)) + TypeName.append("const "); + if (Mods & uint16_t(ModifierOptions::Volatile)) + TypeName.append("volatile "); + if (Mods & uint16_t(ModifierOptions::Unaligned)) + TypeName.append("__unaligned "); + TypeName.append(ModifiedName); + Name = TypeDB.saveTypeName(TypeName); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, + VFTableShapeRecord &Shape) { + Name = TypeDB.saveTypeName("<vftable " + utostr(Shape.getEntryCount()) + + " methods>"); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, + NestedTypeRecord &Nested) { + Name = Nested.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, + OneMethodRecord &Method) { + Name = Method.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, + OverloadedMethodRecord &Method) { + Name = Method.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, + DataMemberRecord &Field) { + Name = Field.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, + StaticDataMemberRecord &Field) { + Name = Field.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, + EnumeratorRecord &Enum) { + Name = Enum.getName(); + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, + BaseClassRecord &Base) { + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, + VirtualBaseClassRecord &VBase) { + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, + ListContinuationRecord &Cont) { + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord( + CVType &CVR, UdtModSourceLineRecord &ModSourceLine) { + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, + UdtSourceLineRecord &SourceLine) { + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, BitFieldRecord &BF) { + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord( + CVType &CVR, MethodOverloadListRecord &Overloads) { + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, BuildInfoRecord &BI) { + return Error::success(); +} + +Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR, + VFPtrRecord &VFP) { + return Error::success(); +} diff --git a/lib/DebugInfo/CodeView/TypeDumper.cpp b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp index 4274d834076a..033585ba8cc9 100644 --- a/lib/DebugInfo/CodeView/TypeDumper.cpp +++ b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp @@ -1,4 +1,5 @@ -//===-- TypeDumper.cpp - CodeView type info dumper --------------*- C++ -*-===// +//===-- TypeDumpVisitor.cpp - CodeView type info dumper -----------*- C++ +//-*-===// // // The LLVM Compiler Infrastructure // @@ -7,9 +8,13 @@ // //===----------------------------------------------------------------------===// -#include 
"llvm/DebugInfo/CodeView/TypeDumper.h" +#include "llvm/DebugInfo/CodeView/TypeDumpVisitor.h" + #include "llvm/ADT/SmallString.h" +#include "llvm/DebugInfo/CodeView/CVTypeDumper.h" #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" +#include "llvm/DebugInfo/CodeView/TypeDatabase.h" +#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h" #include "llvm/DebugInfo/CodeView/TypeDeserializer.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" @@ -20,52 +25,6 @@ using namespace llvm; using namespace llvm::codeview; -/// The names here all end in "*". If the simple type is a pointer type, we -/// return the whole name. Otherwise we lop off the last character in our -/// StringRef. -static const EnumEntry<SimpleTypeKind> SimpleTypeNames[] = { - {"void*", SimpleTypeKind::Void}, - {"<not translated>*", SimpleTypeKind::NotTranslated}, - {"HRESULT*", SimpleTypeKind::HResult}, - {"signed char*", SimpleTypeKind::SignedCharacter}, - {"unsigned char*", SimpleTypeKind::UnsignedCharacter}, - {"char*", SimpleTypeKind::NarrowCharacter}, - {"wchar_t*", SimpleTypeKind::WideCharacter}, - {"char16_t*", SimpleTypeKind::Character16}, - {"char32_t*", SimpleTypeKind::Character32}, - {"__int8*", SimpleTypeKind::SByte}, - {"unsigned __int8*", SimpleTypeKind::Byte}, - {"short*", SimpleTypeKind::Int16Short}, - {"unsigned short*", SimpleTypeKind::UInt16Short}, - {"__int16*", SimpleTypeKind::Int16}, - {"unsigned __int16*", SimpleTypeKind::UInt16}, - {"long*", SimpleTypeKind::Int32Long}, - {"unsigned long*", SimpleTypeKind::UInt32Long}, - {"int*", SimpleTypeKind::Int32}, - {"unsigned*", SimpleTypeKind::UInt32}, - {"__int64*", SimpleTypeKind::Int64Quad}, - {"unsigned __int64*", SimpleTypeKind::UInt64Quad}, - {"__int64*", SimpleTypeKind::Int64}, - {"unsigned __int64*", SimpleTypeKind::UInt64}, - {"__int128*", SimpleTypeKind::Int128}, - {"unsigned __int128*", SimpleTypeKind::UInt128}, - {"__half*", SimpleTypeKind::Float16}, - {"float*", SimpleTypeKind::Float32}, - {"float*", SimpleTypeKind::Float32PartialPrecision}, - {"__float48*", SimpleTypeKind::Float48}, - {"double*", SimpleTypeKind::Float64}, - {"long double*", SimpleTypeKind::Float80}, - {"__float128*", SimpleTypeKind::Float128}, - {"_Complex float*", SimpleTypeKind::Complex32}, - {"_Complex double*", SimpleTypeKind::Complex64}, - {"_Complex long double*", SimpleTypeKind::Complex80}, - {"_Complex __float128*", SimpleTypeKind::Complex128}, - {"bool*", SimpleTypeKind::Boolean8}, - {"__bool16*", SimpleTypeKind::Boolean16}, - {"__bool32*", SimpleTypeKind::Boolean32}, - {"__bool64*", SimpleTypeKind::Boolean64}, -}; - static const EnumEntry<TypeLeafKind> LeafTypeNames[] = { #define CV_TYPE(enum, val) {#enum, enum}, #include "llvm/DebugInfo/CodeView/TypeRecords.def" @@ -90,10 +49,8 @@ static const EnumEntry<uint16_t> ClassOptionNames[] = { }; static const EnumEntry<uint8_t> MemberAccessNames[] = { - ENUM_ENTRY(MemberAccess, None), - ENUM_ENTRY(MemberAccess, Private), - ENUM_ENTRY(MemberAccess, Protected), - ENUM_ENTRY(MemberAccess, Public), + ENUM_ENTRY(MemberAccess, None), ENUM_ENTRY(MemberAccess, Private), + ENUM_ENTRY(MemberAccess, Protected), ENUM_ENTRY(MemberAccess, Public), }; static const EnumEntry<uint16_t> MethodOptionNames[] = { @@ -151,8 +108,7 @@ static const EnumEntry<uint16_t> PtrMemberRepNames[] = { }; static const EnumEntry<uint16_t> TypeModifierNames[] = { - ENUM_ENTRY(ModifierOptions, Const), - ENUM_ENTRY(ModifierOptions, Volatile), + ENUM_ENTRY(ModifierOptions, Const), ENUM_ENTRY(ModifierOptions, Volatile), 
ENUM_ENTRY(ModifierOptions, Unaligned), }; @@ -203,38 +159,22 @@ static StringRef getLeafTypeName(TypeLeafKind LT) { return "UnknownLeaf"; } -Error CVTypeDumper::visitTypeBegin(CVRecord<TypeLeafKind> &Record) { - assert(!IsInFieldList); - // Reset Name to the empty string. If the visitor sets it, we know it. - Name = ""; +void TypeDumpVisitor::printTypeIndex(StringRef FieldName, TypeIndex TI) const { + CVTypeDumper::printTypeIndex(*W, FieldName, TI, TypeDB); +} +Error TypeDumpVisitor::visitTypeBegin(CVType &Record) { W->startLine() << getLeafTypeName(Record.Type); - W->getOStream() << " (" << HexNumber(getNextTypeIndex()) << ")"; + W->getOStream() << " (" << HexNumber(TypeDB.getNextTypeIndex().getIndex()) + << ")"; W->getOStream() << " {\n"; W->indent(); W->printEnum("TypeLeafKind", unsigned(Record.Type), makeArrayRef(LeafTypeNames)); - if (Record.Type == LF_FIELDLIST) { - // Record that we're in a field list so that members do not get assigned - // type indices. - IsInFieldList = true; - } return Error::success(); } -Error CVTypeDumper::visitTypeEnd(CVRecord<TypeLeafKind> &Record) { - if (Record.Type == LF_FIELDLIST) { - assert(IsInFieldList); - IsInFieldList = false; - } - assert(!IsInFieldList); - - // Record every type that is not a field list member, even if Name is empty. - // CVUDTNames is indexed by type index, and must have one entry for every - // type. Field list members are not recorded, and are only referenced by - // their containing field list record. - recordType(Name); - +Error TypeDumpVisitor::visitTypeEnd(CVType &Record) { if (PrintRecordBytes) W->printBinaryBlock("LeafData", getBytesAsCharacters(Record.content())); @@ -243,11 +183,7 @@ Error CVTypeDumper::visitTypeEnd(CVRecord<TypeLeafKind> &Record) { return Error::success(); } -Error CVTypeDumper::visitMemberBegin(CVMemberRecord &Record) { - assert(IsInFieldList); - // Reset Name to the empty string. If the visitor sets it, we know it. - Name = ""; - +Error TypeDumpVisitor::visitMemberBegin(CVMemberRecord &Record) { W->startLine() << getLeafTypeName(Record.Kind); W->getOStream() << " {\n"; W->indent(); @@ -256,8 +192,7 @@ Error CVTypeDumper::visitMemberBegin(CVMemberRecord &Record) { return Error::success(); } -Error CVTypeDumper::visitMemberEnd(CVMemberRecord &Record) { - assert(IsInFieldList); +Error TypeDumpVisitor::visitMemberEnd(CVMemberRecord &Record) { if (PrintRecordBytes) W->printBinaryBlock("LeafData", getBytesAsCharacters(Record.Data)); @@ -266,46 +201,33 @@ Error CVTypeDumper::visitMemberEnd(CVMemberRecord &Record) { return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - FieldListRecord &FieldList) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, + FieldListRecord &FieldList) { CVTypeVisitor Visitor(*this); if (auto EC = Visitor.visitFieldListMemberStream(FieldList.Data)) return EC; - Name = "<field list>"; return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - StringIdRecord &String) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, StringIdRecord &String) { printTypeIndex("Id", String.getId()); W->printString("StringData", String.getString()); - // Put this in CVUDTNames so it gets printed with LF_UDT_SRC_LINE. 
- Name = String.getString(); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - ArgListRecord &Args) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, ArgListRecord &Args) { auto Indices = Args.getIndices(); uint32_t Size = Indices.size(); W->printNumber("NumArgs", Size); ListScope Arguments(*W, "Arguments"); - SmallString<256> TypeName("("); for (uint32_t I = 0; I < Size; ++I) { printTypeIndex("ArgType", Indices[I]); - StringRef ArgTypeName = getTypeName(Indices[I]); - TypeName.append(ArgTypeName); - if (I + 1 != Size) - TypeName.append(", "); } - TypeName.push_back(')'); - Name = saveName(TypeName); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - ClassRecord &Class) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, ClassRecord &Class) { uint16_t Props = static_cast<uint16_t>(Class.getOptions()); W->printNumber("MemberCount", Class.getMemberCount()); W->printFlags("Properties", Props, makeArrayRef(ClassOptionNames)); @@ -316,12 +238,10 @@ Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, W->printString("Name", Class.getName()); if (Props & uint16_t(ClassOptions::HasUniqueName)) W->printString("LinkageName", Class.getUniqueName()); - Name = Class.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - UnionRecord &Union) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, UnionRecord &Union) { uint16_t Props = static_cast<uint16_t>(Union.getOptions()); W->printNumber("MemberCount", Union.getMemberCount()); W->printFlags("Properties", Props, makeArrayRef(ClassOptionNames)); @@ -330,12 +250,10 @@ Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, W->printString("Name", Union.getName()); if (Props & uint16_t(ClassOptions::HasUniqueName)) W->printString("LinkageName", Union.getUniqueName()); - Name = Union.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - EnumRecord &Enum) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, EnumRecord &Enum) { uint16_t Props = static_cast<uint16_t>(Enum.getOptions()); W->printNumber("NumEnumerators", Enum.getMemberCount()); W->printFlags("Properties", uint16_t(Enum.getOptions()), @@ -345,43 +263,35 @@ Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, W->printString("Name", Enum.getName()); if (Props & uint16_t(ClassOptions::HasUniqueName)) W->printString("LinkageName", Enum.getUniqueName()); - Name = Enum.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - ArrayRecord &AT) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, ArrayRecord &AT) { printTypeIndex("ElementType", AT.getElementType()); printTypeIndex("IndexType", AT.getIndexType()); W->printNumber("SizeOf", AT.getSize()); W->printString("Name", AT.getName()); - Name = AT.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - VFTableRecord &VFT) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, VFTableRecord &VFT) { printTypeIndex("CompleteClass", VFT.getCompleteClass()); printTypeIndex("OverriddenVFTable", VFT.getOverriddenVTable()); W->printHex("VFPtrOffset", VFT.getVFPtrOffset()); W->printString("VFTableName", VFT.getName()); for (auto N : VFT.getMethodNames()) W->printString("MethodName", N); - Name = VFT.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - 
MemberFuncIdRecord &Id) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, MemberFuncIdRecord &Id) { printTypeIndex("ClassType", Id.getClassType()); printTypeIndex("FunctionType", Id.getFunctionType()); W->printString("Name", Id.getName()); - Name = Id.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - ProcedureRecord &Proc) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, ProcedureRecord &Proc) { printTypeIndex("ReturnType", Proc.getReturnType()); W->printEnum("CallingConvention", uint8_t(Proc.getCallConv()), makeArrayRef(CallingConventions)); @@ -389,18 +299,10 @@ Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, makeArrayRef(FunctionOptionEnum)); W->printNumber("NumParameters", Proc.getParameterCount()); printTypeIndex("ArgListType", Proc.getArgumentList()); - - StringRef ReturnTypeName = getTypeName(Proc.getReturnType()); - StringRef ArgListTypeName = getTypeName(Proc.getArgumentList()); - SmallString<256> TypeName(ReturnTypeName); - TypeName.push_back(' '); - TypeName.append(ArgListTypeName); - Name = saveName(TypeName); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - MemberFunctionRecord &MF) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, MemberFunctionRecord &MF) { printTypeIndex("ReturnType", MF.getReturnType()); printTypeIndex("ClassType", MF.getClassType()); printTypeIndex("ThisType", MF.getThisType()); @@ -411,21 +313,11 @@ Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, W->printNumber("NumParameters", MF.getParameterCount()); printTypeIndex("ArgListType", MF.getArgumentList()); W->printNumber("ThisAdjustment", MF.getThisPointerAdjustment()); - - StringRef ReturnTypeName = getTypeName(MF.getReturnType()); - StringRef ClassTypeName = getTypeName(MF.getClassType()); - StringRef ArgListTypeName = getTypeName(MF.getArgumentList()); - SmallString<256> TypeName(ReturnTypeName); - TypeName.push_back(' '); - TypeName.append(ClassTypeName); - TypeName.append("::"); - TypeName.append(ArgListTypeName); - Name = saveName(TypeName); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - MethodOverloadListRecord &MethodList) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, + MethodOverloadListRecord &MethodList) { for (auto &M : MethodList.getMethods()) { ListScope S(*W, "Method"); printMemberAttributes(M.getAccess(), M.getMethodKind(), M.getOptions()); @@ -436,26 +328,21 @@ Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - FuncIdRecord &Func) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, FuncIdRecord &Func) { printTypeIndex("ParentScope", Func.getParentScope()); printTypeIndex("FunctionType", Func.getFunctionType()); W->printString("Name", Func.getName()); - Name = Func.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - TypeServer2Record &TS) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, TypeServer2Record &TS) { W->printBinary("Signature", TS.getGuid()); W->printNumber("Age", TS.getAge()); W->printString("Name", TS.getName()); - Name = TS.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - PointerRecord &Ptr) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) { printTypeIndex("PointeeType", Ptr.getReferentType()); 
W->printHex("PointerAttributes", uint32_t(Ptr.getOptions())); W->printEnum("PtrType", unsigned(Ptr.getPointerKind()), @@ -474,82 +361,42 @@ Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, printTypeIndex("ClassType", MI.getContainingType()); W->printEnum("Representation", uint16_t(MI.getRepresentation()), makeArrayRef(PtrMemberRepNames)); - - StringRef PointeeName = getTypeName(Ptr.getReferentType()); - StringRef ClassName = getTypeName(MI.getContainingType()); - SmallString<256> TypeName(PointeeName); - TypeName.push_back(' '); - TypeName.append(ClassName); - TypeName.append("::*"); - Name = saveName(TypeName); - } else { - SmallString<256> TypeName; - if (Ptr.isConst()) - TypeName.append("const "); - if (Ptr.isVolatile()) - TypeName.append("volatile "); - if (Ptr.isUnaligned()) - TypeName.append("__unaligned "); - - TypeName.append(getTypeName(Ptr.getReferentType())); - - if (Ptr.getMode() == PointerMode::LValueReference) - TypeName.append("&"); - else if (Ptr.getMode() == PointerMode::RValueReference) - TypeName.append("&&"); - else if (Ptr.getMode() == PointerMode::Pointer) - TypeName.append("*"); - - if (!TypeName.empty()) - Name = saveName(TypeName); } + return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - ModifierRecord &Mod) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, ModifierRecord &Mod) { uint16_t Mods = static_cast<uint16_t>(Mod.getModifiers()); printTypeIndex("ModifiedType", Mod.getModifiedType()); W->printFlags("Modifiers", Mods, makeArrayRef(TypeModifierNames)); - StringRef ModifiedName = getTypeName(Mod.getModifiedType()); - SmallString<256> TypeName; - if (Mods & uint16_t(ModifierOptions::Const)) - TypeName.append("const "); - if (Mods & uint16_t(ModifierOptions::Volatile)) - TypeName.append("volatile "); - if (Mods & uint16_t(ModifierOptions::Unaligned)) - TypeName.append("__unaligned "); - TypeName.append(ModifiedName); - Name = saveName(TypeName); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - BitFieldRecord &BitField) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, BitFieldRecord &BitField) { printTypeIndex("Type", BitField.getType()); W->printNumber("BitSize", BitField.getBitSize()); W->printNumber("BitOffset", BitField.getBitOffset()); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - VFTableShapeRecord &Shape) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, + VFTableShapeRecord &Shape) { W->printNumber("VFEntryCount", Shape.getEntryCount()); - Name = saveName("<vftable " + utostr(Shape.getEntryCount()) + " methods>"); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - UdtSourceLineRecord &Line) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, + UdtSourceLineRecord &Line) { printTypeIndex("UDT", Line.getUDT()); printTypeIndex("SourceFile", Line.getSourceFile()); W->printNumber("LineNumber", Line.getLineNumber()); return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, - UdtModSourceLineRecord &Line) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, + UdtModSourceLineRecord &Line) { printTypeIndex("UDT", Line.getUDT()); printTypeIndex("SourceFile", Line.getSourceFile()); W->printNumber("LineNumber", Line.getLineNumber()); @@ -557,8 +404,7 @@ Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, return Error::success(); } -Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> 
&CVR, - BuildInfoRecord &Args) { +Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, BuildInfoRecord &Args) { W->printNumber("NumArgs", static_cast<uint32_t>(Args.getArgs().size())); ListScope Arguments(*W, "Arguments"); @@ -568,13 +414,14 @@ Error CVTypeDumper::visitKnownRecord(CVRecord<TypeLeafKind> &CVR, return Error::success(); } -void CVTypeDumper::printMemberAttributes(MemberAttributes Attrs) { +void TypeDumpVisitor::printMemberAttributes(MemberAttributes Attrs) { return printMemberAttributes(Attrs.getAccess(), Attrs.getMethodKind(), Attrs.getFlags()); } -void CVTypeDumper::printMemberAttributes(MemberAccess Access, MethodKind Kind, - MethodOptions Options) { +void TypeDumpVisitor::printMemberAttributes(MemberAccess Access, + MethodKind Kind, + MethodOptions Options) { W->printEnum("AccessSpecifier", uint8_t(Access), makeArrayRef(MemberAccessNames)); // Data members will be vanilla. Don't try to print a method kind for them. @@ -586,27 +433,26 @@ void CVTypeDumper::printMemberAttributes(MemberAccess Access, MethodKind Kind, } } -Error CVTypeDumper::visitUnknownMember(CVMemberRecord &Record) { +Error TypeDumpVisitor::visitUnknownMember(CVMemberRecord &Record) { W->printHex("UnknownMember", unsigned(Record.Kind)); return Error::success(); } -Error CVTypeDumper::visitUnknownType(CVRecord<TypeLeafKind> &Record) { +Error TypeDumpVisitor::visitUnknownType(CVType &Record) { W->printEnum("Kind", uint16_t(Record.kind()), makeArrayRef(LeafTypeNames)); W->printNumber("Length", uint32_t(Record.content().size())); return Error::success(); } -Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, - NestedTypeRecord &Nested) { +Error TypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR, + NestedTypeRecord &Nested) { printTypeIndex("Type", Nested.getNestedType()); W->printString("Name", Nested.getName()); - Name = Nested.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, - OneMethodRecord &Method) { +Error TypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR, + OneMethodRecord &Method) { MethodKind K = Method.getMethodKind(); printMemberAttributes(Method.getAccess(), K, Method.getOptions()); printTypeIndex("Type", Method.getType()); @@ -614,58 +460,53 @@ Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, if (Method.isIntroducingVirtual()) W->printHex("VFTableOffset", Method.getVFTableOffset()); W->printString("Name", Method.getName()); - Name = Method.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, - OverloadedMethodRecord &Method) { +Error TypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR, + OverloadedMethodRecord &Method) { W->printHex("MethodCount", Method.getNumOverloads()); printTypeIndex("MethodListIndex", Method.getMethodList()); W->printString("Name", Method.getName()); - Name = Method.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, - DataMemberRecord &Field) { +Error TypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR, + DataMemberRecord &Field) { printMemberAttributes(Field.getAccess(), MethodKind::Vanilla, MethodOptions::None); printTypeIndex("Type", Field.getType()); W->printHex("FieldOffset", Field.getFieldOffset()); W->printString("Name", Field.getName()); - Name = Field.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, - StaticDataMemberRecord &Field) { +Error TypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR, + StaticDataMemberRecord &Field) { 
printMemberAttributes(Field.getAccess(), MethodKind::Vanilla, MethodOptions::None); printTypeIndex("Type", Field.getType()); W->printString("Name", Field.getName()); - Name = Field.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, - VFPtrRecord &VFTable) { +Error TypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR, + VFPtrRecord &VFTable) { printTypeIndex("Type", VFTable.getType()); return Error::success(); } -Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, - EnumeratorRecord &Enum) { +Error TypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR, + EnumeratorRecord &Enum) { printMemberAttributes(Enum.getAccess(), MethodKind::Vanilla, MethodOptions::None); W->printNumber("EnumValue", Enum.getValue()); W->printString("Name", Enum.getName()); - Name = Enum.getName(); return Error::success(); } -Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, - BaseClassRecord &Base) { +Error TypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR, + BaseClassRecord &Base) { printMemberAttributes(Base.getAccess(), MethodKind::Vanilla, MethodOptions::None); printTypeIndex("BaseType", Base.getBaseType()); @@ -673,8 +514,8 @@ Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, return Error::success(); } -Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, - VirtualBaseClassRecord &Base) { +Error TypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR, + VirtualBaseClassRecord &Base) { printMemberAttributes(Base.getAccess(), MethodKind::Vanilla, MethodOptions::None); printTypeIndex("BaseType", Base.getBaseType()); @@ -684,89 +525,8 @@ Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, return Error::success(); } -Error CVTypeDumper::visitKnownMember(CVMemberRecord &CVR, - ListContinuationRecord &Cont) { +Error TypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR, + ListContinuationRecord &Cont) { printTypeIndex("ContinuationIndex", Cont.getContinuationIndex()); return Error::success(); } - -StringRef CVTypeDumper::getTypeName(TypeIndex TI) { - if (TI.isNoneType()) - return "<no type>"; - - if (TI.isSimple()) { - // This is a simple type. - for (const auto &SimpleTypeName : SimpleTypeNames) { - if (SimpleTypeName.Value == TI.getSimpleKind()) { - if (TI.getSimpleMode() == SimpleTypeMode::Direct) - return SimpleTypeName.Name.drop_back(1); - // Otherwise, this is a pointer type. We gloss over the distinction - // between near, far, 64, 32, etc, and just give a pointer type. - return SimpleTypeName.Name; - } - } - return "<unknown simple type>"; - } - - // User-defined type. 
- StringRef UDTName; - unsigned UDTIndex = TI.getIndex() - 0x1000; - if (UDTIndex < CVUDTNames.size()) - return CVUDTNames[UDTIndex]; - - return "<unknown UDT>"; -} - -void CVTypeDumper::printTypeIndex(StringRef FieldName, TypeIndex TI) { - StringRef TypeName; - if (!TI.isNoneType()) - TypeName = getTypeName(TI); - if (!TypeName.empty()) - W->printHex(FieldName, TypeName, TI.getIndex()); - else - W->printHex(FieldName, TI.getIndex()); -} - -Error CVTypeDumper::dump(const CVRecord<TypeLeafKind> &Record) { - assert(W && "printer should not be null"); - TypeDeserializer Deserializer; - TypeVisitorCallbackPipeline Pipeline; - Pipeline.addCallbackToPipeline(Deserializer); - Pipeline.addCallbackToPipeline(*this); - - CVTypeVisitor Visitor(Pipeline); - - CVRecord<TypeLeafKind> RecordCopy = Record; - if (auto EC = Visitor.visitTypeRecord(RecordCopy)) - return EC; - return Error::success(); -} - -Error CVTypeDumper::dump(const CVTypeArray &Types) { - assert(W && "printer should not be null"); - TypeDeserializer Deserializer; - TypeVisitorCallbackPipeline Pipeline; - Pipeline.addCallbackToPipeline(Deserializer); - Pipeline.addCallbackToPipeline(*this); - - CVTypeVisitor Visitor(Pipeline); - - if (auto EC = Visitor.visitTypeStream(Types)) - return EC; - return Error::success(); -} - -Error CVTypeDumper::dump(ArrayRef<uint8_t> Data) { - msf::ByteStream Stream(Data); - CVTypeArray Types; - msf::StreamReader Reader(Stream); - if (auto EC = Reader.readArray(Types, Reader.getLength())) - return EC; - - return dump(Types); -} - -void CVTypeDumper::setPrinter(ScopedPrinter *P) { - static ScopedPrinter NullP(llvm::nulls()); - W = P ? P : &NullP; -} diff --git a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp index 6126470aa099..08bc74a81e9a 100644 --- a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp +++ b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp @@ -56,13 +56,20 @@ DWARFAbbreviationDeclaration::extract(DataExtractor Data, auto A = static_cast<Attribute>(Data.getULEB128(OffsetPtr)); auto F = static_cast<Form>(Data.getULEB128(OffsetPtr)); if (A && F) { - auto FixedFormByteSize = DWARFFormValue::getFixedByteSize(F); - AttributeSpecs.push_back(AttributeSpec(A, F, FixedFormByteSize)); + Optional<int64_t> V; + bool IsImplicitConst = (F == DW_FORM_implicit_const); + if (IsImplicitConst) + V = Data.getSLEB128(OffsetPtr); + else if (auto Size = DWARFFormValue::getFixedByteSize(F)) + V = *Size; + AttributeSpecs.push_back(AttributeSpec(A, F, V)); + if (IsImplicitConst) + continue; // If this abbrevation still has a fixed byte size, then update the // FixedAttributeSize as needed. if (FixedAttributeSize) { - if (FixedFormByteSize) - FixedAttributeSize->NumBytes += *FixedFormByteSize; + if (V) + FixedAttributeSize->NumBytes += *V; else { switch (F) { case DW_FORM_addr: @@ -129,6 +136,8 @@ void DWARFAbbreviationDeclaration::dump(raw_ostream &OS) const { OS << formString; else OS << format("DW_FORM_Unknown_%x", Spec.Form); + if (Spec.isImplicitConst()) + OS << '\t' << *Spec.ByteSizeOrValue; OS << '\n'; } OS << '\n'; @@ -160,11 +169,15 @@ Optional<DWARFFormValue> DWARFAbbreviationDeclaration::getAttributeValue( if (*MatchAttrIndex == AttrIndex) { // We have arrived at the attribute to extract, extract if from Offset. 
DWARFFormValue FormValue(Spec.Form); + if (Spec.isImplicitConst()) { + FormValue.setSValue(*Spec.ByteSizeOrValue); + return FormValue; + } if (FormValue.extractValue(DebugInfoData, &Offset, &U)) return FormValue; } // March Offset along until we get to the attribute we want. - if (Optional<uint8_t> FixedSize = Spec.getByteSize(U)) + if (auto FixedSize = Spec.getByteSize(U)) Offset += *FixedSize; else DWARFFormValue::skipValue(Spec.Form, DebugInfoData, &Offset, &U); @@ -185,9 +198,17 @@ size_t DWARFAbbreviationDeclaration::FixedSizeInfo::getByteSize( return ByteSize; } -Optional<uint8_t> DWARFAbbreviationDeclaration::AttributeSpec::getByteSize( +Optional<int64_t> DWARFAbbreviationDeclaration::AttributeSpec::getByteSize( const DWARFUnit &U) const { - return ByteSize ? ByteSize : DWARFFormValue::getFixedByteSize(Form, &U); + if (isImplicitConst()) + return 0; + if (ByteSizeOrValue) + return ByteSizeOrValue; + Optional<int64_t> S; + auto FixedByteSize = DWARFFormValue::getFixedByteSize(Form, &U); + if (FixedByteSize) + S = *FixedByteSize; + return S; } Optional<size_t> DWARFAbbreviationDeclaration::getFixedAttributesByteSize( diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp index 7df66c76e8b5..77f6f65ee131 100644 --- a/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -14,6 +14,7 @@ #include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h" #include "llvm/DebugInfo/DWARF/DWARFDebugPubTable.h" #include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h" +#include "llvm/Object/Decompressor.h" #include "llvm/Object/MachO.h" #include "llvm/Object/RelocVisitor.h" #include "llvm/Support/Compression.h" @@ -577,66 +578,6 @@ DWARFContext::getInliningInfoForAddress(uint64_t Address, return InliningInfo; } -static bool consumeCompressedGnuHeader(StringRef &data, - uint64_t &OriginalSize) { - // Consume "ZLIB" prefix. - if (!data.startswith("ZLIB")) - return false; - data = data.substr(4); - // Consume uncompressed section size (big-endian 8 bytes). - DataExtractor extractor(data, false, 8); - uint32_t Offset = 0; - OriginalSize = extractor.getU64(&Offset); - if (Offset == 0) - return false; - data = data.substr(Offset); - return true; -} - -static bool consumeCompressedZLibHeader(StringRef &Data, uint64_t &OriginalSize, - bool IsLE, bool Is64Bit) { - using namespace ELF; - uint64_t HdrSize = Is64Bit ? sizeof(Elf64_Chdr) : sizeof(Elf32_Chdr); - if (Data.size() < HdrSize) - return false; - - DataExtractor Extractor(Data, IsLE, 0); - uint32_t Offset = 0; - if (Extractor.getUnsigned(&Offset, Is64Bit ? sizeof(Elf64_Word) - : sizeof(Elf32_Word)) != - ELFCOMPRESS_ZLIB) - return false; - - // Skip Elf64_Chdr::ch_reserved field. - if (Is64Bit) - Offset += sizeof(Elf64_Word); - - OriginalSize = Extractor.getUnsigned(&Offset, Is64Bit ? sizeof(Elf64_Xword) - : sizeof(Elf32_Word)); - Data = Data.substr(HdrSize); - return true; -} - -static bool tryDecompress(StringRef &Name, StringRef &Data, - SmallString<32> &Out, bool ZLibStyle, bool IsLE, - bool Is64Bit) { - if (!zlib::isAvailable()) - return false; - - uint64_t OriginalSize; - bool Result = - ZLibStyle ? consumeCompressedZLibHeader(Data, OriginalSize, IsLE, Is64Bit) - : consumeCompressedGnuHeader(Data, OriginalSize); - - if (!Result || zlib::uncompress(Data, Out, OriginalSize) != zlib::StatusOK) - return false; - - // gnu-style names are started from "z", consume that. 
- if (!ZLibStyle) - Name = Name.substr(1); - return true; -} - DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, const LoadedObjectInfo *L) : IsLittleEndian(Obj.isLittleEndian()), @@ -660,18 +601,23 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, if (!L || !L->getLoadedSectionContents(*RelocatedSection,data)) Section.getContents(data); - name = name.substr(name.find_first_not_of("._")); // Skip . and _ prefixes. - - bool ZLibStyleCompressed = Section.isCompressed(); - if (ZLibStyleCompressed || name.startswith("zdebug_")) { + if (Decompressor::isCompressed(Section)) { + Expected<Decompressor> Decompressor = + Decompressor::create(name, data, IsLittleEndian, AddressSize == 8); + if (!Decompressor) + continue; SmallString<32> Out; - if (!tryDecompress(name, data, Out, ZLibStyleCompressed, IsLittleEndian, - AddressSize == 8)) + if (auto Err = Decompressor->decompress(Out)) continue; UncompressedSections.emplace_back(std::move(Out)); data = UncompressedSections.back(); } + // Compressed sections names in GNU style starts from ".z", + // at this point section is decompressed and we drop compression prefix. + name = name.substr( + name.find_first_not_of("._z")); // Skip ".", "z" and "_" prefixes. + StringRef *SectionData = StringSwitch<StringRef *>(name) .Case("debug_info", &InfoSection.Data) diff --git a/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp index 9f623e4954c8..c487e1dca7c6 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp @@ -57,7 +57,7 @@ bool DWARFDebugInfoEntry::extractFast(const DWARFUnit &U, uint32_t *OffsetPtr, // Skip all data in the .debug_info for the attributes for (const auto &AttrSpec : AbbrevDecl->attributes()) { // Check if this attribute has a fixed byte size. - if (Optional<uint8_t> FixedSize = AttrSpec.getByteSize(U)) { + if (auto FixedSize = AttrSpec.getByteSize(U)) { // Attribute byte size if fixed, just add the size to the offset. *OffsetPtr += *FixedSize; } else if (!DWARFFormValue::skipValue(AttrSpec.Form, DebugInfoData, diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp index 2aac3474654f..89b83b11ab68 100644 --- a/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -152,13 +152,6 @@ const char *DWARFDie::getAttributeValueAsString(dwarf::Attribute Attr, return Result.hasValue() ? 
Result.getValue() : FailValue; } -uint64_t DWARFDie::getAttributeValueAsAddress(dwarf::Attribute Attr, - uint64_t FailValue) const { - if (auto Value = getAttributeValueAsAddress(Attr)) - return *Value; - return FailValue; -} - Optional<uint64_t> DWARFDie::getAttributeValueAsAddress(dwarf::Attribute Attr) const { if (auto FormValue = getAttributeValue(Attr)) @@ -166,13 +159,6 @@ DWARFDie::getAttributeValueAsAddress(dwarf::Attribute Attr) const { return None; } -int64_t DWARFDie::getAttributeValueAsSignedConstant(dwarf::Attribute Attr, - int64_t FailValue) const { - if (auto Value = getAttributeValueAsSignedConstant(Attr)) - return *Value; - return FailValue; -} - Optional<int64_t> DWARFDie::getAttributeValueAsSignedConstant(dwarf::Attribute Attr) const { if (auto FormValue = getAttributeValue(Attr)) @@ -180,15 +166,6 @@ DWARFDie::getAttributeValueAsSignedConstant(dwarf::Attribute Attr) const { return None; } -uint64_t -DWARFDie::getAttributeValueAsUnsignedConstant(dwarf::Attribute Attr, - uint64_t FailValue) const { - if (auto Value = getAttributeValueAsUnsignedConstant(Attr)) - return *Value; - return FailValue; -} - - Optional<uint64_t> DWARFDie::getAttributeValueAsUnsignedConstant(dwarf::Attribute Attr) const { if (auto FormValue = getAttributeValue(Attr)) @@ -196,14 +173,6 @@ DWARFDie::getAttributeValueAsUnsignedConstant(dwarf::Attribute Attr) const { return None; } -uint64_t DWARFDie::getAttributeValueAsReference(dwarf::Attribute Attr, - uint64_t FailValue) const { - if (auto Value = getAttributeValueAsReference(Attr)) - return *Value; - return FailValue; -} - - Optional<uint64_t> DWARFDie::getAttributeValueAsReference(dwarf::Attribute Attr) const { if (auto FormValue = getAttributeValue(Attr)) @@ -211,13 +180,6 @@ DWARFDie::getAttributeValueAsReference(dwarf::Attribute Attr) const { return None; } -uint64_t DWARFDie::getAttributeValueAsSectionOffset(dwarf::Attribute Attr, - uint64_t FailValue) const { - if (auto Value = getAttributeValueAsSectionOffset(Attr)) - return *Value; - return FailValue; -} - Optional<uint64_t> DWARFDie::getAttributeValueAsSectionOffset(dwarf::Attribute Attr) const { if (auto FormValue = getAttributeValue(Attr)) @@ -345,9 +307,10 @@ DWARFDie::getName(DINameKind Kind) const { void DWARFDie::getCallerFrame(uint32_t &CallFile, uint32_t &CallLine, uint32_t &CallColumn) const { - CallFile = getAttributeValueAsUnsignedConstant(DW_AT_call_file, 0); - CallLine = getAttributeValueAsUnsignedConstant(DW_AT_call_line, 0); - CallColumn = getAttributeValueAsUnsignedConstant(DW_AT_call_column, 0); + CallFile = getAttributeValueAsUnsignedConstant(DW_AT_call_file).getValueOr(0); + CallLine = getAttributeValueAsUnsignedConstant(DW_AT_call_line).getValueOr(0); + CallColumn = + getAttributeValueAsUnsignedConstant(DW_AT_call_column).getValueOr(0); } void DWARFDie::dump(raw_ostream &OS, unsigned RecurseDepth, diff --git a/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/lib/DebugInfo/DWARF/DWARFFormValue.cpp index e48a6f0981b7..dc9310dc4e89 100644 --- a/lib/DebugInfo/DWARF/DWARFFormValue.cpp +++ b/lib/DebugInfo/DWARF/DWARFFormValue.cpp @@ -153,7 +153,7 @@ static Optional<uint8_t> getFixedByteSize(dwarf::Form Form, const T *U) { return 16; case DW_FORM_implicit_const: - // The implicit value is stored in the abbreviation as a ULEB128, any + // The implicit value is stored in the abbreviation as a SLEB128, and // there no data in debug info. 
return 0; @@ -280,6 +280,8 @@ bool DWARFFormValue::isFormClass(DWARFFormValue::FormClass FC) const { case DW_FORM_GNU_str_index: case DW_FORM_GNU_strp_alt: return (FC == FC_String); + case DW_FORM_implicit_const: + return (FC == FC_Constant); default: break; } diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp index 63fb0d3bc368..ee2c569b0bce 100644 --- a/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -230,10 +230,12 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) { BaseAddr = UnitDie.getAttributeValueAsAddress(DW_AT_entry_pc); if (BaseAddr) setBaseAddress(*BaseAddr); - AddrOffsetSectionBase = UnitDie.getAttributeValueAsSectionOffset( - DW_AT_GNU_addr_base, 0); - RangeSectionBase = UnitDie.getAttributeValueAsSectionOffset( - DW_AT_rnglists_base, 0); + AddrOffsetSectionBase = + UnitDie.getAttributeValueAsSectionOffset(DW_AT_GNU_addr_base) + .getValueOr(0); + RangeSectionBase = + UnitDie.getAttributeValueAsSectionOffset(DW_AT_rnglists_base) + .getValueOr(0); // Don't fall back to DW_AT_GNU_ranges_base: it should be ignored for // skeleton CU DIE, so that DWARF users not aware of it are not broken. } diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 8f6b1849169a..05615d3cc6cf 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -374,6 +374,9 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section, write(isBE, TargetPtr, static_cast<uint32_t>(Result & 0xffffffffU)); break; } + case ELF::R_AARCH64_PREL64: + write(isBE, TargetPtr, Value + Addend - FinalAddress); + break; case ELF::R_AARCH64_CALL26: // fallthrough case ELF::R_AARCH64_JUMP26: { // Operation: S+A-P. Set Call or B immediate value to bits fff_fffc of the diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp index a87b9bec1ed2..e3a7bae02e0a 100644 --- a/lib/IR/AutoUpgrade.cpp +++ b/lib/IR/AutoUpgrade.cpp @@ -77,6 +77,11 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { switch (Name[0]) { default: break; case 'a': { + if (Name.startswith("arm.rbit") || Name.startswith("aarch64.rbit")) { + NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::bitreverse, + F->arg_begin()->getType()); + return true; + } if (Name.startswith("arm.neon.vclz")) { Type* args[2] = { F->arg_begin()->getType(), @@ -1761,6 +1766,11 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { return; } + case Intrinsic::bitreverse: + CI->replaceAllUsesWith(Builder.CreateCall(NewFn, {CI->getArgOperand(0)})); + CI->eraseFromParent(); + return; + case Intrinsic::ctlz: case Intrinsic::cttz: assert(CI->getNumArgOperands() == 1 && diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp index 2ea572490b6d..d06161067f5f 100644 --- a/lib/IR/DIBuilder.cpp +++ b/lib/IR/DIBuilder.cpp @@ -90,6 +90,20 @@ void DIBuilder::finalize() { VMContext, SmallVector<Metadata *, 16>(AllImportedModules.begin(), AllImportedModules.end()))); + for (const auto &I : AllMacrosPerParent) { + // DIMacroNode's with nullptr parent are DICompileUnit direct children. + if (!I.first) { + CUNode->replaceMacros(MDTuple::get(VMContext, I.second.getArrayRef())); + continue; + } + // Otherwise, it must be a temporary DIMacroFile that need to be resolved. 
+ auto *TMF = cast<DIMacroFile>(I.first); + auto *MF = DIMacroFile::get(VMContext, dwarf::DW_MACINFO_start_file, + TMF->getLine(), TMF->getFile(), + getOrCreateMacroArray(I.second.getArrayRef())); + replaceTemporary(llvm::TempDIMacroNode(TMF), MF); + } + // Now that all temp nodes have been replaced or deleted, resolve remaining // cycles. for (const auto &N : UnresolvedNodes) @@ -179,6 +193,31 @@ DIFile *DIBuilder::createFile(StringRef Filename, StringRef Directory, return DIFile::get(VMContext, Filename, Directory, CSKind, Checksum); } +DIMacro *DIBuilder::createMacro(DIMacroFile *Parent, unsigned LineNumber, + unsigned MacroType, StringRef Name, + StringRef Value) { + assert(!Name.empty() && "Unable to create macro without name"); + assert((MacroType == dwarf::DW_MACINFO_undef || + MacroType == dwarf::DW_MACINFO_define) && + "Unexpected macro type"); + auto *M = DIMacro::get(VMContext, MacroType, LineNumber, Name, Value); + AllMacrosPerParent[Parent].insert(M); + return M; +} + +DIMacroFile *DIBuilder::createTempMacroFile(DIMacroFile *Parent, + unsigned LineNumber, DIFile *File) { + auto *MF = DIMacroFile::getTemporary(VMContext, dwarf::DW_MACINFO_start_file, + LineNumber, File, DIMacroNodeArray()) + .release(); + AllMacrosPerParent[Parent].insert(MF); + // Add the new temporary DIMacroFile to the macro per parent map as a parent. + // This is needed to assure DIMacroFile with no children to have an entry in + // the map. Otherwise, it will not be resolved in DIBuilder::finalize(). + AllMacrosPerParent.insert({MF, {}}); + return MF; +} + DIEnumerator *DIBuilder::createEnumerator(StringRef Name, int64_t Val) { assert(!Name.empty() && "Unable to create enumerator without name"); return DIEnumerator::get(VMContext, Val, Name); @@ -509,6 +548,11 @@ DINodeArray DIBuilder::getOrCreateArray(ArrayRef<Metadata *> Elements) { return MDTuple::get(VMContext, Elements); } +DIMacroNodeArray +DIBuilder::getOrCreateMacroArray(ArrayRef<Metadata *> Elements) { + return MDTuple::get(VMContext, Elements); +} + DITypeRefArray DIBuilder::getOrCreateTypeArray(ArrayRef<Metadata *> Elements) { SmallVector<llvm::Metadata *, 16> Elts; for (unsigned i = 0, e = Elements.size(); i != e; ++i) { diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp index 31f89514151c..6f7356524d38 100644 --- a/lib/IR/Globals.cpp +++ b/lib/IR/Globals.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/Operator.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" +#include "LLVMContextImpl.h" using namespace llvm; //===----------------------------------------------------------------------===// @@ -37,6 +38,10 @@ static_assert(sizeof(GlobalValue) == sizeof(Constant) + 2 * sizeof(void *) + 2 * sizeof(unsigned), "unexpected GlobalValue size growth"); +// GlobalObject adds a comdat. +static_assert(sizeof(GlobalObject) == sizeof(GlobalValue) + sizeof(void *), + "unexpected GlobalObject size growth"); + bool GlobalValue::isMaterializable() const { if (const Function *F = dyn_cast<Function>(this)) return F->isMaterializable(); @@ -160,11 +165,24 @@ Comdat *GlobalValue::getComdat() { return cast<GlobalObject>(this)->getComdat(); } -void GlobalObject::setSection(StringRef S) { - Section = S; +StringRef GlobalObject::getSectionImpl() const { + assert(hasSection()); + return getContext().pImpl->GlobalObjectSections[this]; +} - // The C api requires this to be null terminated. - Section.c_str(); +void GlobalObject::setSection(StringRef S) { + // Do nothing if we're clearing the section and it is already empty. 
+ if (!hasSection() && S.empty()) + return; + + // Get or create a stable section name string and put it in the table in the + // context. + S = getContext().pImpl->SectionStrings.insert(S).first->first(); + getContext().pImpl->GlobalObjectSections[this] = S; + + // Update the HasSectionHashEntryBit. Setting the section to the empty string + // means this global no longer has a section. + setGlobalObjectFlag(HasSectionHashEntryBit, !S.empty()); } bool GlobalValue::isDeclaration() const { diff --git a/lib/IR/LLVMContextImpl.h b/lib/IR/LLVMContextImpl.h index e9e30ef0656f..850c81cfabb2 100644 --- a/lib/IR/LLVMContextImpl.h +++ b/lib/IR/LLVMContextImpl.h @@ -26,6 +26,7 @@ #include "llvm/ADT/Hashing.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringSet.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" @@ -1194,6 +1195,12 @@ public: /// Collection of per-GlobalObject metadata used in this context. DenseMap<const GlobalObject *, MDGlobalAttachmentMap> GlobalObjectMetadata; + /// Collection of per-GlobalObject sections used in this context. + DenseMap<const GlobalObject *, StringRef> GlobalObjectSections; + + /// Stable collection of section strings. + StringSet<> SectionStrings; + /// DiscriminatorTable - This table maps file:line locations to an /// integer representing the next DWARF path discriminator to assign to /// instructions in different blocks at the same location. diff --git a/lib/LTO/LTOBackend.cpp b/lib/LTO/LTOBackend.cpp index 6342cbe4fd90..809db80bc916 100644 --- a/lib/LTO/LTOBackend.cpp +++ b/lib/LTO/LTOBackend.cpp @@ -17,7 +17,6 @@ #include "llvm/LTO/LTOBackend.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CGSCCPassManager.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Bitcode/BitcodeReader.h" @@ -36,6 +35,7 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/FunctionImportUtils.h" #include "llvm/Transforms/Utils/SplitModule.h" diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp index 928f69a17de9..a14b86179d6e 100644 --- a/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/lib/LTO/ThinLTOCodeGenerator.cpp @@ -284,7 +284,8 @@ public: const FunctionImporter::ExportSetTy &ExportList, const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR, const GVSummaryMapTy &DefinedFunctions, - const DenseSet<GlobalValue::GUID> &PreservedSymbols) { + const DenseSet<GlobalValue::GUID> &PreservedSymbols, unsigned OptLevel, + const TargetMachineBuilder &TMBuilder) { if (CachePath.empty()) return; @@ -306,12 +307,42 @@ public: SHA1 Hasher; + // Include the parts of the LTO configuration that affect code generation. + auto AddString = [&](StringRef Str) { + Hasher.update(Str); + Hasher.update(ArrayRef<uint8_t>{0}); + }; + auto AddUnsigned = [&](unsigned I) { + uint8_t Data[4]; + Data[0] = I; + Data[1] = I >> 8; + Data[2] = I >> 16; + Data[3] = I >> 24; + Hasher.update(ArrayRef<uint8_t>{Data, 4}); + }; + // Start with the compiler revision Hasher.update(LLVM_VERSION_STRING); #ifdef HAVE_LLVM_REVISION Hasher.update(LLVM_REVISION); #endif + // Hash the optimization level and the target machine settings. + AddString(TMBuilder.MCpu); + // FIXME: Hash more of Options. 
For now all clients initialize Options from + // command-line flags (which is unsupported in production), but may set + // RelaxELFRelocations. The clang driver can also pass FunctionSections, + // DataSections and DebuggerTuning via command line flags. + AddUnsigned(TMBuilder.Options.RelaxELFRelocations); + AddUnsigned(TMBuilder.Options.FunctionSections); + AddUnsigned(TMBuilder.Options.DataSections); + AddUnsigned((unsigned)TMBuilder.Options.DebuggerTuning); + AddString(TMBuilder.MAttr); + if (TMBuilder.RelocModel) + AddUnsigned(*TMBuilder.RelocModel); + AddUnsigned(TMBuilder.CGOptLevel); + AddUnsigned(OptLevel); + Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash))); for (auto F : ExportList) // The export list can impact the internalization, be conservative here @@ -928,7 +959,8 @@ void ThinLTOCodeGenerator::run() { ModuleCacheEntry CacheEntry(CacheOptions.Path, *Index, ModuleIdentifier, ImportLists[ModuleIdentifier], ExportList, ResolvedODR[ModuleIdentifier], - DefinedFunctions, GUIDPreservedSymbols); + DefinedFunctions, GUIDPreservedSymbols, + OptLevel, TMBuilder); auto CacheEntryPath = CacheEntry.getEntryPath(); { diff --git a/lib/Object/CMakeLists.txt b/lib/Object/CMakeLists.txt index f1a7c1a5ade9..b895c3fcc050 100644 --- a/lib/Object/CMakeLists.txt +++ b/lib/Object/CMakeLists.txt @@ -3,6 +3,7 @@ add_llvm_library(LLVMObject ArchiveWriter.cpp Binary.cpp COFFObjectFile.cpp + Decompressor.cpp ELF.cpp ELFObjectFile.cpp Error.cpp diff --git a/lib/Object/Decompressor.cpp b/lib/Object/Decompressor.cpp new file mode 100644 index 000000000000..bca41fd9f487 --- /dev/null +++ b/lib/Object/Decompressor.cpp @@ -0,0 +1,102 @@ +//===-- Decompressor.cpp --------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Object/Decompressor.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Support/Compression.h" +#include "llvm/Support/DataExtractor.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/ELF.h" + +using namespace llvm; +using namespace llvm::support::endian; +using namespace object; + +Expected<Decompressor> Decompressor::create(StringRef Name, StringRef Data, + bool IsLE, bool Is64Bit) { + if (!zlib::isAvailable()) + return createError("zlib is not available"); + + Decompressor D(Data); + Error Err = isGnuStyle(Name) ? D.consumeCompressedGnuHeader() + : D.consumeCompressedZLibHeader(Is64Bit, IsLE); + if (Err) + return std::move(Err); + return D; +} + +Decompressor::Decompressor(StringRef Data) + : SectionData(Data), DecompressedSize(0) {} + +Error Decompressor::consumeCompressedGnuHeader() { + if (!SectionData.startswith("ZLIB")) + return createError("corrupted compressed section header"); + + SectionData = SectionData.substr(4); + + // Consume uncompressed section size (big-endian 8 bytes). + if (SectionData.size() < 8) + return createError("corrupted uncompressed section size"); + DecompressedSize = read64be(SectionData.data()); + SectionData = SectionData.substr(8); + + return Error::success(); +} + +Error Decompressor::consumeCompressedZLibHeader(bool Is64Bit, + bool IsLittleEndian) { + using namespace ELF; + uint64_t HdrSize = Is64Bit ? 
sizeof(Elf64_Chdr) : sizeof(Elf32_Chdr); + if (SectionData.size() < HdrSize) + return createError("corrupted compressed section header"); + + DataExtractor Extractor(SectionData, IsLittleEndian, 0); + uint32_t Offset = 0; + if (Extractor.getUnsigned(&Offset, Is64Bit ? sizeof(Elf64_Word) + : sizeof(Elf32_Word)) != + ELFCOMPRESS_ZLIB) + return createError("unsupported compression type"); + + // Skip Elf64_Chdr::ch_reserved field. + if (Is64Bit) + Offset += sizeof(Elf64_Word); + + DecompressedSize = Extractor.getUnsigned( + &Offset, Is64Bit ? sizeof(Elf64_Xword) : sizeof(Elf32_Word)); + SectionData = SectionData.substr(HdrSize); + return Error::success(); +} + +bool Decompressor::isGnuStyle(StringRef Name) { + return Name.startswith(".zdebug"); +} + +bool Decompressor::isCompressed(const object::SectionRef &Section) { + StringRef Name; + if (Section.getName(Name)) + return false; + return Section.isCompressed() || isGnuStyle(Name); +} + +bool Decompressor::isCompressedELFSection(uint64_t Flags, StringRef Name) { + return (Flags & ELF::SHF_COMPRESSED) || isGnuStyle(Name); +} + +Error Decompressor::decompress(SmallString<32> &Out) { + Out.resize(DecompressedSize); + return decompress({Out.data(), (size_t)DecompressedSize}); +} + +Error Decompressor::decompress(MutableArrayRef<char> Buffer) { + size_t Size = Buffer.size(); + zlib::Status Status = zlib::uncompress(SectionData, Buffer.data(), Size); + if (Status != zlib::StatusOK) + return createError("decompression failed"); + return Error::success(); +} diff --git a/lib/ObjectYAML/DWARFYAML.cpp b/lib/ObjectYAML/DWARFYAML.cpp index 42a448a7bdfd..014e63fe7d34 100644 --- a/lib/ObjectYAML/DWARFYAML.cpp +++ b/lib/ObjectYAML/DWARFYAML.cpp @@ -27,17 +27,18 @@ void MappingTraits<DWARFYAML::Data>::mapping(IO &IO, DWARFYAML::Data &DWARF) { IO.setContext(&DWARF); IO.mapOptional("debug_str", DWARF.DebugStrings); IO.mapOptional("debug_abbrev", DWARF.AbbrevDecls); - if(!DWARF.ARanges.empty() || !IO.outputting()) + if (!DWARF.ARanges.empty() || !IO.outputting()) IO.mapOptional("debug_aranges", DWARF.ARanges); - if(!DWARF.PubNames.Entries.empty() || !IO.outputting()) + if (!DWARF.PubNames.Entries.empty() || !IO.outputting()) IO.mapOptional("debug_pubnames", DWARF.PubNames); - if(!DWARF.PubTypes.Entries.empty() || !IO.outputting()) + if (!DWARF.PubTypes.Entries.empty() || !IO.outputting()) IO.mapOptional("debug_pubtypes", DWARF.PubTypes); - if(!DWARF.GNUPubNames.Entries.empty() || !IO.outputting()) + if (!DWARF.GNUPubNames.Entries.empty() || !IO.outputting()) IO.mapOptional("debug_gnu_pubnames", DWARF.GNUPubNames); - if(!DWARF.GNUPubTypes.Entries.empty() || !IO.outputting()) + if (!DWARF.GNUPubTypes.Entries.empty() || !IO.outputting()) IO.mapOptional("debug_gnu_pubtypes", DWARF.GNUPubTypes); IO.mapOptional("debug_info", DWARF.CompileUnits); + IO.mapOptional("debug_line", DWARF.DebugLines); IO.setContext(&oldContext); } @@ -62,7 +63,7 @@ void MappingTraits<DWARFYAML::ARangeDescriptor>::mapping( } void MappingTraits<DWARFYAML::ARange>::mapping(IO &IO, - DWARFYAML::ARange &Range) { + DWARFYAML::ARange &Range) { IO.mapRequired("Length", Range.Length); IO.mapRequired("Version", Range.Version); IO.mapRequired("CuOffset", Range.CuOffset); @@ -106,15 +107,61 @@ void MappingTraits<DWARFYAML::Entry>::mapping(IO &IO, DWARFYAML::Entry &Entry) { IO.mapRequired("Values", Entry.Values); } -void MappingTraits<DWARFYAML::FormValue>::mapping(IO &IO, - DWARFYAML::FormValue &FormValue) { +void MappingTraits<DWARFYAML::FormValue>::mapping( + IO &IO, DWARFYAML::FormValue &FormValue) 
{ IO.mapOptional("Value", FormValue.Value); - if(!FormValue.CStr.empty() || !IO.outputting()) + if (!FormValue.CStr.empty() || !IO.outputting()) IO.mapOptional("CStr", FormValue.CStr); - if(!FormValue.BlockData.empty() || !IO.outputting()) + if (!FormValue.BlockData.empty() || !IO.outputting()) IO.mapOptional("BlockData", FormValue.BlockData); } +void MappingTraits<DWARFYAML::File>::mapping(IO &IO, DWARFYAML::File &File) { + IO.mapRequired("Name", File.Name); + IO.mapRequired("DirIdx", File.DirIdx); + IO.mapRequired("ModTime", File.ModTime); + IO.mapRequired("Length", File.Length); +} + +void MappingTraits<DWARFYAML::LineTableOpcode>::mapping( + IO &IO, DWARFYAML::LineTableOpcode &LineTableOpcode) { + IO.mapRequired("Opcode", LineTableOpcode.Opcode); + if (LineTableOpcode.Opcode == dwarf::DW_LNS_extended_op) { + IO.mapRequired("ExtLen", LineTableOpcode.ExtLen); + IO.mapRequired("SubOpcode", LineTableOpcode.SubOpcode); + } + + if (!LineTableOpcode.UnknownOpcodeData.empty() || !IO.outputting()) + IO.mapOptional("UnknownOpcodeData", LineTableOpcode.UnknownOpcodeData); + if (!LineTableOpcode.UnknownOpcodeData.empty() || !IO.outputting()) + IO.mapOptional("StandardOpcodeData", LineTableOpcode.StandardOpcodeData); + if (!LineTableOpcode.FileEntry.Name.empty() || !IO.outputting()) + IO.mapOptional("FileEntry", LineTableOpcode.FileEntry); + if (LineTableOpcode.Opcode == dwarf::DW_LNS_advance_line || !IO.outputting()) + IO.mapOptional("SData", LineTableOpcode.SData); + IO.mapOptional("Data", LineTableOpcode.Data); +} + +void MappingTraits<DWARFYAML::LineTable>::mapping( + IO &IO, DWARFYAML::LineTable &LineTable) { + IO.mapRequired("TotalLength", LineTable.TotalLength); + if (LineTable.TotalLength == UINT32_MAX) + IO.mapRequired("TotalLength64", LineTable.TotalLength64); + IO.mapRequired("Version", LineTable.Version); + IO.mapRequired("PrologueLength", LineTable.PrologueLength); + IO.mapRequired("MinInstLength", LineTable.MinInstLength); + if(LineTable.Version >= 4) + IO.mapRequired("MaxOpsPerInst", LineTable.MaxOpsPerInst); + IO.mapRequired("DefaultIsStmt", LineTable.DefaultIsStmt); + IO.mapRequired("LineBase", LineTable.LineBase); + IO.mapRequired("LineRange", LineTable.LineRange); + IO.mapRequired("OpcodeBase", LineTable.OpcodeBase); + IO.mapRequired("StandardOpcodeLengths", LineTable.StandardOpcodeLengths); + IO.mapRequired("IncludeDirs", LineTable.IncludeDirs); + IO.mapRequired("Files", LineTable.Files); + IO.mapRequired("Opcodes", LineTable.Opcodes); +} + } // namespace llvm::yaml } // namespace llvm diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp index 6e0aae5fd852..2994a07b1ccf 100644 --- a/lib/Passes/PassBuilder.cpp +++ b/lib/Passes/PassBuilder.cpp @@ -94,14 +94,17 @@ #include "llvm/Transforms/Scalar/Float2Int.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/GuardWidening.h" +#include "llvm/Transforms/Scalar/IVUsersPrinter.h" #include "llvm/Transforms/Scalar/IndVarSimplify.h" #include "llvm/Transforms/Scalar/JumpThreading.h" #include "llvm/Transforms/Scalar/LICM.h" +#include "llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h" #include "llvm/Transforms/Scalar/LoopDataPrefetch.h" #include "llvm/Transforms/Scalar/LoopDeletion.h" #include "llvm/Transforms/Scalar/LoopDistribute.h" #include "llvm/Transforms/Scalar/LoopIdiomRecognize.h" #include "llvm/Transforms/Scalar/LoopInstSimplify.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Scalar/LoopRotation.h" #include "llvm/Transforms/Scalar/LoopSimplifyCFG.h" #include 
"llvm/Transforms/Scalar/LoopStrengthReduce.h" @@ -220,7 +223,8 @@ public: /// \brief No-op loop pass which does nothing. struct NoOpLoopPass { - PreservedAnalyses run(Loop &L, LoopAnalysisManager &) { + PreservedAnalyses run(Loop &L, LoopAnalysisManager &, + LoopStandardAnalysisResults &, LPMUpdater &) { return PreservedAnalyses::all(); } static StringRef name() { return "NoOpLoopPass"; } @@ -233,7 +237,9 @@ class NoOpLoopAnalysis : public AnalysisInfoMixin<NoOpLoopAnalysis> { public: struct Result {}; - Result run(Loop &, LoopAnalysisManager &) { return Result(); } + Result run(Loop &, LoopAnalysisManager &, LoopStandardAnalysisResults &) { + return Result(); + } static StringRef name() { return "NoOpLoopAnalysis"; } }; @@ -1019,7 +1025,9 @@ bool PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E, #define LOOP_ANALYSIS(NAME, CREATE_PASS) \ if (Name == "require<" NAME ">") { \ LPM.addPass(RequireAnalysisPass< \ - std::remove_reference<decltype(CREATE_PASS)>::type, Loop>()); \ + std::remove_reference<decltype(CREATE_PASS)>::type, Loop, \ + LoopAnalysisManager, LoopStandardAnalysisResults &, \ + LPMUpdater &>()); \ return true; \ } \ if (Name == "invalidate<" NAME ">") { \ diff --git a/lib/ProfileData/InstrProf.cpp b/lib/ProfileData/InstrProf.cpp index 77c6ffc9c253..74acd9e5e207 100644 --- a/lib/ProfileData/InstrProf.cpp +++ b/lib/ProfileData/InstrProf.cpp @@ -811,4 +811,47 @@ bool needsComdatForCounter(const Function &F, const Module &M) { return true; } + +// Check if INSTR_PROF_RAW_VERSION_VAR is defined. +bool isIRPGOFlagSet(const Module *M) { + auto IRInstrVar = + M->getNamedGlobal(INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR)); + if (!IRInstrVar || IRInstrVar->isDeclaration() || + IRInstrVar->hasLocalLinkage()) + return false; + + // Check if the flag is set. + if (!IRInstrVar->hasInitializer()) + return false; + + const Constant *InitVal = IRInstrVar->getInitializer(); + if (!InitVal) + return false; + + return (dyn_cast<ConstantInt>(InitVal)->getZExtValue() & + VARIANT_MASK_IR_PROF) != 0; +} + +// Check if we can safely rename this Comdat function. +bool canRenameComdatFunc(const Function &F, bool CheckAddressTaken) { + if (F.getName().empty()) + return false; + if (!needsComdatForCounter(F, *(F.getParent()))) + return false; + // Unsafe to rename the address-taken function (which can be used in + // function comparison). + if (CheckAddressTaken && F.hasAddressTaken()) + return false; + // Only safe to do if this function may be discarded if it is not used + // in the compilation unit. + if (!GlobalValue::isDiscardableIfUnused(F.getLinkage())) + return false; + + // For AvailableExternallyLinkage functions. 
+ if (!F.hasComdat()) { + assert(F.getLinkage() == GlobalValue::AvailableExternallyLinkage); + return true; + } + return true; +} } // end namespace llvm diff --git a/lib/Support/FileOutputBuffer.cpp b/lib/Support/FileOutputBuffer.cpp index 2c7bf0435d88..57e5a8d7871c 100644 --- a/lib/Support/FileOutputBuffer.cpp +++ b/lib/Support/FileOutputBuffer.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/Support/Errc.h" +#include "llvm/Support/Path.h" #include "llvm/Support/Signals.h" #include <system_error> @@ -28,8 +29,10 @@ using llvm::sys::fs::mapped_file_region; namespace llvm { FileOutputBuffer::FileOutputBuffer(std::unique_ptr<mapped_file_region> R, - StringRef Path, StringRef TmpPath) - : Region(std::move(R)), FinalPath(Path), TempPath(TmpPath) {} + StringRef Path, StringRef TmpPath, + bool IsRegular) + : Region(std::move(R)), FinalPath(Path), TempPath(TmpPath), + IsRegular(IsRegular) {} FileOutputBuffer::~FileOutputBuffer() { // Close the mapping before deleting the temp file, so that the removal @@ -40,9 +43,10 @@ FileOutputBuffer::~FileOutputBuffer() { ErrorOr<std::unique_ptr<FileOutputBuffer>> FileOutputBuffer::create(StringRef FilePath, size_t Size, unsigned Flags) { - // If file already exists, it must be a regular file (to be mappable). + // Check file is not a regular file, in which case we cannot remove it. sys::fs::file_status Stat; std::error_code EC = sys::fs::status(FilePath, Stat); + bool IsRegular = true; switch (Stat.type()) { case sys::fs::file_type::file_not_found: // If file does not exist, we'll create one. @@ -56,25 +60,34 @@ FileOutputBuffer::create(StringRef FilePath, size_t Size, unsigned Flags) { default: if (EC) return EC; - else - return make_error_code(errc::operation_not_permitted); + IsRegular = false; } - // Delete target file. - EC = sys::fs::remove(FilePath); - if (EC) - return EC; - - unsigned Mode = sys::fs::all_read | sys::fs::all_write; - // If requested, make the output file executable. - if (Flags & F_executable) - Mode |= sys::fs::all_exe; + if (IsRegular) { + // Delete target file. + EC = sys::fs::remove(FilePath); + if (EC) + return EC; + } - // Create new file in same directory but with random name. SmallString<128> TempFilePath; int FD; - EC = sys::fs::createUniqueFile(Twine(FilePath) + ".tmp%%%%%%%", FD, - TempFilePath, Mode); + if (IsRegular) { + unsigned Mode = sys::fs::all_read | sys::fs::all_write; + // If requested, make the output file executable. + if (Flags & F_executable) + Mode |= sys::fs::all_exe; + // Create new file in same directory but with random name. + EC = sys::fs::createUniqueFile(Twine(FilePath) + ".tmp%%%%%%%", FD, + TempFilePath, Mode); + } else { + // Create a temporary file. Since this is a special file, we will not move + // it and the new file can be in another filesystem. This avoids trying to + // create a temporary file in /dev when outputting to /dev/null for example. 
+ EC = sys::fs::createTemporaryFile(sys::path::filename(FilePath), "", FD, + TempFilePath); + } + if (EC) return EC; @@ -99,8 +112,8 @@ FileOutputBuffer::create(StringRef FilePath, size_t Size, unsigned Flags) { if (Ret) return std::error_code(errno, std::generic_category()); - std::unique_ptr<FileOutputBuffer> Buf( - new FileOutputBuffer(std::move(MappedFile), FilePath, TempFilePath)); + std::unique_ptr<FileOutputBuffer> Buf(new FileOutputBuffer( + std::move(MappedFile), FilePath, TempFilePath, IsRegular)); return std::move(Buf); } @@ -108,10 +121,19 @@ std::error_code FileOutputBuffer::commit() { // Unmap buffer, letting OS flush dirty pages to file on disk. Region.reset(); + std::error_code EC; + if (IsRegular) { + // Rename file to final name. + EC = sys::fs::rename(Twine(TempPath), Twine(FinalPath)); + sys::DontRemoveFileOnSignal(TempPath); + } else { + EC = sys::fs::copy_file(TempPath, FinalPath); + std::error_code RMEC = sys::fs::remove(TempPath); + sys::DontRemoveFileOnSignal(TempPath); + if (RMEC) + return RMEC; + } - // Rename file to final name. - std::error_code EC = sys::fs::rename(Twine(TempPath), Twine(FinalPath)); - sys::DontRemoveFileOnSignal(TempPath); return EC; } } // namespace diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index 8a09589aa884..d1b40412a6fc 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -111,6 +111,7 @@ enum ProcessorTypes { AMDATHLON, AMDFAM14H, AMDFAM16H, + AMDFAM17H, CPU_TYPE_MAX }; @@ -149,6 +150,7 @@ enum ProcessorSubtypes { AMD_BTVER2, AMDFAM15H_BDVER3, AMDFAM15H_BDVER4, + AMDFAM17H_ZNVER1, CPU_SUBTYPE_MAX }; @@ -742,6 +744,14 @@ static void getAMDProcessorTypeAndSubtype(unsigned int Family, } *Subtype = AMD_BTVER2; break; // "btver2" + case 23: + *Type = AMDFAM17H; + if (Features & (1 << FEATURE_ADX)) { + *Subtype = AMDFAM17H_ZNVER1; + break; // "znver1" + } + *Subtype = AMD_BTVER1; + break; default: break; // "generic" } @@ -950,6 +960,15 @@ StringRef sys::getHostCPUName() { default: return "amdfam16"; } + case AMDFAM17H: + switch (Subtype) { + case AMD_BTVER1: + return "btver1"; + case AMDFAM17H_ZNVER1: + return "znver1"; + default: + return "amdfam17"; + } default: return "generic"; } diff --git a/lib/Support/TarWriter.cpp b/lib/Support/TarWriter.cpp index f79b364dc1f7..f06abf46cce4 100644 --- a/lib/Support/TarWriter.cpp +++ b/lib/Support/TarWriter.cpp @@ -54,6 +54,13 @@ struct UstarHeader { }; static_assert(sizeof(UstarHeader) == BlockSize, "invalid Ustar header"); +static UstarHeader makeUstarHeader() { + UstarHeader Hdr = {}; + memcpy(Hdr.Magic, "ustar", 5); // Ustar magic + memcpy(Hdr.Version, "00", 2); // Ustar version + return Hdr; +} + // A PAX attribute is in the form of "<length> <key>=<value>\n" // where <length> is the length of the entire string including // the length field itself. An example string is this. @@ -98,10 +105,9 @@ static void writePaxHeader(raw_fd_ostream &OS, StringRef Path) { std::string PaxAttr = formatPax("path", Path); // Create a 512-byte header. - UstarHeader Hdr = {}; + UstarHeader Hdr = makeUstarHeader(); snprintf(Hdr.Size, sizeof(Hdr.Size), "%011zo", PaxAttr.size()); - Hdr.TypeFlag = 'x'; // PAX magic - memcpy(Hdr.Magic, "ustar", 6); // Ustar magic + Hdr.TypeFlag = 'x'; // PAX magic computeChecksum(Hdr); // Write them down. 
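The pax record emitted by writePaxHeader above is length-prefixed, and the length counts the entire record including the digits of the length itself, so it cannot be computed in one pass. A minimal standalone sketch of that encoding follows (illustrative only; the real implementation is TarWriter.cpp's formatPax, which this hunk only calls):

#include <string>

// Build "<length> <key>=<value>\n" where <length> counts the whole record,
// including the digits of <length> itself.
static std::string formatPaxRecord(const std::string &Key,
                                   const std::string &Val) {
  std::string Rest = " " + Key + "=" + Val + "\n"; // everything after the digits
  std::size_t Digits = 1;
  // Grow the digit count until it is consistent with the total length.
  while (std::to_string(Rest.size() + Digits).size() != Digits)
    ++Digits;
  return std::to_string(Rest.size() + Digits) + Rest;
}

For example, formatPaxRecord("path", "foo") yields "12 path=foo\n": the payload " path=foo\n" is ten bytes, and the two digits of "12" bring the total to twelve, matching the stated length.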
@@ -116,7 +122,7 @@ static void writePaxHeader(raw_fd_ostream &OS, StringRef Path) { static std::pair<StringRef, StringRef> splitPath(StringRef Path) { if (Path.size() <= sizeof(UstarHeader::Name)) return {"", Path}; - size_t Sep = Path.rfind('/', sizeof(UstarHeader::Name) + 1); + size_t Sep = Path.rfind('/', sizeof(UstarHeader::Prefix) + 1); if (Sep == StringRef::npos) return {"", Path}; return {Path.substr(0, Sep), Path.substr(Sep + 1)}; @@ -138,11 +144,10 @@ static void writeUstarHeader(raw_fd_ostream &OS, StringRef Path, size_t Size) { StringRef Name; std::tie(Prefix, Name) = splitPath(Path); - UstarHeader Hdr = {}; + UstarHeader Hdr = makeUstarHeader(); memcpy(Hdr.Name, Name.data(), Name.size()); memcpy(Hdr.Mode, "0000664", 8); snprintf(Hdr.Size, sizeof(Hdr.Size), "%011zo", Size); - memcpy(Hdr.Magic, "ustar", 6); memcpy(Hdr.Prefix, Prefix.data(), Prefix.size()); computeChecksum(Hdr); OS << StringRef(reinterpret_cast<char *>(&Hdr), sizeof(Hdr)); diff --git a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def index e927d58ad612..d472a54d9543 100644 --- a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def +++ b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def @@ -18,9 +18,132 @@ namespace llvm { namespace AArch64 { -RegisterBank GPRRegBank; -RegisterBank FPRRegBank; -RegisterBank CCRRegBank; +const uint32_t GPRCoverageData[] = { + // Classes 0-31 + (1u << AArch64::GPR32allRegClassID) | (1u << AArch64::GPR32RegClassID) | + (1u << AArch64::GPR32spRegClassID) | + (1u << AArch64::GPR32commonRegClassID) | + (1u << AArch64::GPR32sponlyRegClassID) | + (1u << AArch64::GPR64allRegClassID) | (1u << AArch64::GPR64RegClassID) | + (1u << AArch64::GPR64spRegClassID) | + (1u << AArch64::GPR64commonRegClassID) | + (1u << AArch64::tcGPR64RegClassID) | + (1u << AArch64::GPR64sponlyRegClassID), + // Classes 32-63 + 0, + // FIXME: The entries below this point can be safely removed once this is + // tablegenerated. It's only needed because of the hardcoded register class + // limit. 
+ // Classes 64-96 + 0, + // Classes 97-128 + 0, + // Classes 129-160 + 0, + // Classes 161-192 + 0, + // Classes 193-224 + 0, +}; + +const uint32_t FPRCoverageData[] = { + // Classes 0-31 + (1u << AArch64::FPR8RegClassID) | (1u << AArch64::FPR16RegClassID) | + (1u << AArch64::FPR32RegClassID) | (1u << AArch64::FPR64RegClassID) | + (1u << AArch64::DDRegClassID) | (1u << AArch64::FPR128RegClassID) | + (1u << AArch64::FPR128_loRegClassID) | (1u << AArch64::DDDRegClassID) | + (1u << AArch64::DDDDRegClassID), + // Classes 32-63 + (1u << (AArch64::QQRegClassID - 32)) | + (1u << (AArch64::QQ_with_qsub0_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQ_with_qsub1_in_FPR128_loRegClassID - 32)) | + (1u + << (AArch64:: + QQQ_with_qsub1_in_FPR128_lo_and_QQQ_with_qsub2_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQ_with_qsub0_in_FPR128_lo_and_QQQ_with_qsub2_in_FPR128_loRegClassID - + 32)) | + (1u << (AArch64::QQQQRegClassID - 32)) | + (1u << (AArch64::QQQQ_with_qsub0_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQQQ_with_qsub1_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQQQ_with_qsub2_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQQQ_with_qsub3_in_FPR128_loRegClassID - 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub1_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub1_in_FPR128_lo_and_QQQQ_with_qsub2_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub2_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub2_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub1_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID - + 32)) | + (1u + << (AArch64:: + QQ_with_qsub0_in_FPR128_lo_and_QQ_with_qsub1_in_FPR128_loRegClassID - + 32)) | + (1u << (AArch64::QQQRegClassID - 32)) | + (1u << (AArch64::QQQ_with_qsub0_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQQ_with_qsub1_in_FPR128_loRegClassID - 32)) | + (1u << (AArch64::QQQ_with_qsub2_in_FPR128_loRegClassID - 32)) | + (1u + << (AArch64:: + QQQ_with_qsub0_in_FPR128_lo_and_QQQ_with_qsub1_in_FPR128_loRegClassID - + 32)), + // FIXME: The entries below this point can be safely removed once this + // is tablegenerated. It's only needed because of the hardcoded register + // class limit. + // Classes 64-96 + 0, + // Classes 97-128 + 0, + // Classes 129-160 + 0, + // Classes 161-192 + 0, + // Classes 193-224 + 0, +}; + +const uint32_t CCRCoverageData[] = { + // Classes 0-31 + 1u << AArch64::CCRRegClassID, + // Classes 32-63 + 0, + // FIXME: The entries below this point can be safely removed once this + // is tablegenerated. It's only needed because of the hardcoded register + // class limit. 
+ // Classes 64-96 + 0, + // Classes 97-128 + 0, + // Classes 129-160 + 0, + // Classes 161-192 + 0, + // Classes 193-224 + 0, +}; + +RegisterBank GPRRegBank(AArch64::GPRRegBankID, "GPR", 64, GPRCoverageData); +RegisterBank FPRRegBank(AArch64::FPRRegBankID, "FPR", 512, FPRCoverageData); +RegisterBank CCRRegBank(AArch64::CCRRegBankID, "CCR", 32, CCRCoverageData); RegisterBank *RegBanks[] = {&GPRRegBank, &FPRRegBank, &CCRRegBank}; diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 74a01835171b..7b581a706fa2 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -159,6 +159,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SETCC, MVT::i64, Custom); setOperationAction(ISD::SETCC, MVT::f32, Custom); setOperationAction(ISD::SETCC, MVT::f64, Custom); + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); setOperationAction(ISD::BRCOND, MVT::Other, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Custom); setOperationAction(ISD::BR_CC, MVT::i64, Custom); diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index c5b95f282ea8..2244baacca17 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -951,10 +951,7 @@ def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>; defm CLS : OneOperandData<0b101, "cls">; defm CLZ : OneOperandData<0b100, "clz", ctlz>; -defm RBIT : OneOperandData<0b000, "rbit">; - -def : Pat<(int_aarch64_rbit GPR32:$Rn), (RBITWr $Rn)>; -def : Pat<(int_aarch64_rbit GPR64:$Rn), (RBITXr $Rn)>; +defm RBIT : OneOperandData<0b000, "rbit", bitreverse>; def REV16Wr : OneWRegData<0b001, "rev16", UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>; diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp index a5fd2fbdde19..b292c9c87dcd 100644 --- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -41,28 +41,30 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) if (AlreadyInit) return; AlreadyInit = true; - // Initialize the GPR bank. - createRegisterBank(AArch64::GPRRegBankID, "GPR"); - // The GPR register bank is fully defined by all the registers in - // GR64all + its subclasses. - addRegBankCoverage(AArch64::GPRRegBankID, AArch64::GPR64allRegClassID, TRI); + const RegisterBank &RBGPR = getRegBank(AArch64::GPRRegBankID); (void)RBGPR; assert(&AArch64::GPRRegBank == &RBGPR && "The order in RegBanks is messed up"); + + const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID); + (void)RBFPR; + assert(&AArch64::FPRRegBank == &RBFPR && + "The order in RegBanks is messed up"); + + const RegisterBank &RBCCR = getRegBank(AArch64::CCRRegBankID); + (void)RBCCR; + assert(&AArch64::CCRRegBank == &RBCCR && + "The order in RegBanks is messed up"); + + // The GPR register bank is fully defined by all the registers in + // GR64all + its subclasses. assert(RBGPR.covers(*TRI.getRegClass(AArch64::GPR32RegClassID)) && "Subclass not added?"); assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit"); - // Initialize the FPR bank. - createRegisterBank(AArch64::FPRRegBankID, "FPR"); // The FPR register bank is fully defined by all the registers in // GR64all + its subclasses. 
- addRegBankCoverage(AArch64::FPRRegBankID, AArch64::QQQQRegClassID, TRI); - const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID); - (void)RBFPR; - assert(&AArch64::FPRRegBank == &RBFPR && - "The order in RegBanks is messed up"); assert(RBFPR.covers(*TRI.getRegClass(AArch64::QQRegClassID)) && "Subclass not added?"); assert(RBFPR.covers(*TRI.getRegClass(AArch64::FPR64RegClassID)) && @@ -70,13 +72,6 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) assert(RBFPR.getSize() == 512 && "FPRs should hold up to 512-bit via QQQQ sequence"); - // Initialize the CCR bank. - createRegisterBank(AArch64::CCRRegBankID, "CCR"); - addRegBankCoverage(AArch64::CCRRegBankID, AArch64::CCRRegClassID, TRI); - const RegisterBank &RBCCR = getRegBank(AArch64::CCRRegBankID); - (void)RBCCR; - assert(&AArch64::CCRRegBank == &RBCCR && - "The order in RegBanks is messed up"); assert(RBCCR.covers(*TRI.getRegClass(AArch64::CCRRegClassID)) && "Class not added?"); assert(RBCCR.getSize() == 32 && "CCR should hold up to 32-bit"); diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 1a17691fc584..b8833e5a5552 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -374,7 +374,7 @@ int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, int AArch64TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) { // Legalize the type. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); @@ -466,28 +466,27 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } -int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, +int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, unsigned Alignment, unsigned AddressSpace) { - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); + auto LT = TLI->getTypeLegalizationCost(DL, Ty); if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && - Src->isVectorTy() && Alignment != 16 && - Src->getVectorElementType()->isIntegerTy(64)) { - // Unaligned stores are extremely inefficient. We don't split - // unaligned v2i64 stores because the negative impact that has shown in - // practice on inlined memcpy code. - // We make v2i64 stores expensive so that we will only vectorize if there + LT.second.is128BitVector() && Alignment < 16) { + // Unaligned stores are extremely inefficient. We don't split all + // unaligned 128-bit stores because the negative impact that has shown in + // practice on inlined block copy code. + // We make such stores expensive so that we will only vectorize if there // are 6 other instructions getting vectorized. - int AmortizationCost = 6; + const int AmortizationCost = 6; return LT.first * 2 * AmortizationCost; } - if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) && - Src->getVectorNumElements() < 8) { + if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8) && + Ty->getVectorNumElements() < 8) { // We scalarize the loads/stores because there is not v.4b register and we // have to promote the elements to v.4h. 
- unsigned NumVecElts = Src->getVectorNumElements(); + unsigned NumVecElts = Ty->getVectorNumElements(); unsigned NumVectorizableInstsToAmortize = NumVecElts * 2; // We generate 2 instructions per vector element. return NumVectorizableInstsToAmortize * NumVecElts * 2; diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index 849fd3d9b44a..18287ed6653f 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -102,7 +102,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr); diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 730bcdcf7afa..e48c1943cb01 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -434,6 +434,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setSchedulingPreference(Sched::RegPressure); setJumpIsExpensive(true); + + // FIXME: This is only partially true. If we have to do vector compares, any + // SGPR pair can be a condition register. If we have a uniform condition, we + // are better off doing SALU operations, where there is only one SCC. For now, + // we don't have a way of knowing during instruction selection if a condition + // will be uniform and we always use vector compares. Assume we are using + // vector compares until that is fixed. setHasMultipleConditionRegisters(true); // SI at least has hardware support for floating point exceptions, but no way @@ -470,12 +477,31 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); + setTargetDAGCombine(ISD::FNEG); } //===----------------------------------------------------------------------===// // Target Information //===----------------------------------------------------------------------===// +static bool fnegFoldsIntoOp(unsigned Opc) { + switch (Opc) { + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: + case ISD::FMA: + case ISD::FMAD: + case ISD::FSIN: + case AMDGPUISD::RCP: + case AMDGPUISD::RCP_LEGACY: + case AMDGPUISD::SIN_HW: + case AMDGPUISD::FMUL_LEGACY: + return true; + default: + return false; + } +} + MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const { return MVT::i32; } @@ -2679,8 +2705,93 @@ SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond, return SDValue(); } +static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, + unsigned Op, + const SDLoc &SL, + SDValue Cond, + SDValue N1, + SDValue N2) { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N1.getValueType(); + + SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond, + N1.getOperand(0), N2.getOperand(0)); + DCI.AddToWorklist(NewSelect.getNode()); + return DAG.getNode(Op, SL, VT, NewSelect); +} + +// Pull a free FP operation out of a select so it may fold into uses. 
+// +// select c, (fneg x), (fneg y) -> fneg (select c, x, y) +// select c, (fneg x), k -> fneg (select c, x, (fneg k)) +// +// select c, (fabs x), (fabs y) -> fabs (select c, x, y) +// select c, (fabs x), +k -> fabs (select c, x, k) +static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, + SDValue N) { + SelectionDAG &DAG = DCI.DAG; + SDValue Cond = N.getOperand(0); + SDValue LHS = N.getOperand(1); + SDValue RHS = N.getOperand(2); + + EVT VT = N.getValueType(); + if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) || + (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) { + return distributeOpThroughSelect(DCI, LHS.getOpcode(), + SDLoc(N), Cond, LHS, RHS); + } + + bool Inv = false; + if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) { + std::swap(LHS, RHS); + Inv = true; + } + + // TODO: Support vector constants. + ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); + if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) { + SDLoc SL(N); + // If one side is an fneg/fabs and the other is a constant, we can push the + // fneg/fabs down. If it's an fabs, the constant needs to be non-negative. + SDValue NewLHS = LHS.getOperand(0); + SDValue NewRHS = RHS; + + // Careful: if the neg can be folded up, don't try to pull it back down. + bool ShouldFoldNeg = true; + + if (NewLHS.hasOneUse()) { + unsigned Opc = NewLHS.getOpcode(); + if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc)) + ShouldFoldNeg = false; + if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL) + ShouldFoldNeg = false; + } + + if (ShouldFoldNeg) { + if (LHS.getOpcode() == ISD::FNEG) + NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + else if (CRHS->isNegative()) + return SDValue(); + + if (Inv) + std::swap(NewLHS, NewRHS); + + SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, + Cond, NewLHS, NewRHS); + DCI.AddToWorklist(NewSelect.getNode()); + return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect); + } + } + + return SDValue(); +} + + SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const { + if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) + return Folded; + SDValue Cond = N->getOperand(0); if (Cond.getOpcode() != ISD::SETCC) return SDValue(); @@ -2724,6 +2835,129 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, return performCtlzCombine(SDLoc(N), Cond, True, False, DCI); } +SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + unsigned Opc = N0.getOpcode(); + + // If the input has multiple uses and we can either fold the negate down, or + // the other uses cannot, give up. This both prevents unprofitable + // transformations and infinite loops: we won't repeatedly try to fold around + // a negate that has no 'good' form. 
+ // + // TODO: Check users can fold + if (fnegFoldsIntoOp(Opc) && !N0.hasOneUse()) + return SDValue(); + + SDLoc SL(N); + switch (Opc) { + case ISD::FADD: { + // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y)) + SDValue LHS = N0.getOperand(0); + SDValue RHS = N0.getOperand(1); + + if (LHS.getOpcode() != ISD::FNEG) + LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); + else + LHS = LHS.getOperand(0); + + if (RHS.getOpcode() != ISD::FNEG) + RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + else + RHS = RHS.getOperand(0); + + SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS); + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + return Res; + } + case ISD::FMUL: + case AMDGPUISD::FMUL_LEGACY: { + // (fneg (fmul x, y)) -> (fmul x, (fneg y)) + // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y)) + SDValue LHS = N0.getOperand(0); + SDValue RHS = N0.getOperand(1); + + if (LHS.getOpcode() == ISD::FNEG) + LHS = LHS.getOperand(0); + else if (RHS.getOpcode() == ISD::FNEG) + RHS = RHS.getOperand(0); + else + RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + + SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS); + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + return Res; + } + case ISD::FMA: + case ISD::FMAD: { + // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z)) + SDValue LHS = N0.getOperand(0); + SDValue MHS = N0.getOperand(1); + SDValue RHS = N0.getOperand(2); + + if (LHS.getOpcode() == ISD::FNEG) + LHS = LHS.getOperand(0); + else if (MHS.getOpcode() == ISD::FNEG) + MHS = MHS.getOperand(0); + else + MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS); + + if (RHS.getOpcode() != ISD::FNEG) + RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + else + RHS = RHS.getOperand(0); + + SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS); + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + return Res; + } + case ISD::FP_EXTEND: + case AMDGPUISD::RCP: + case AMDGPUISD::RCP_LEGACY: + case ISD::FSIN: + case AMDGPUISD::SIN_HW: { + SDValue CvtSrc = N0.getOperand(0); + if (CvtSrc.getOpcode() == ISD::FNEG) { + // (fneg (fp_extend (fneg x))) -> (fp_extend x) + // (fneg (rcp (fneg x))) -> (rcp x) + return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0)); + } + + if (!N0.hasOneUse()) + return SDValue(); + + // (fneg (fp_extend x)) -> (fp_extend (fneg x)) + // (fneg (rcp x)) -> (rcp (fneg x)) + SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); + return DAG.getNode(Opc, SL, VT, Neg); + } + case ISD::FP_ROUND: { + SDValue CvtSrc = N0.getOperand(0); + + if (CvtSrc.getOpcode() == ISD::FNEG) { + // (fneg (fp_round (fneg x))) -> (fp_round x) + return DAG.getNode(ISD::FP_ROUND, SL, VT, + CvtSrc.getOperand(0), N0.getOperand(1)); + } + + if (!N0.hasOneUse()) + return SDValue(); + + // (fneg (fp_round x)) -> (fp_round (fneg x)) + SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); + return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1)); + } + default: + return SDValue(); + } +} + SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -2829,6 +3063,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return performMulLoHi24Combine(N, DCI); case ISD::SELECT: return performSelectCombine(N, DCI); + case ISD::FNEG: + return performFNegCombine(N, DCI); case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { assert(!N->getValueType(0).isVector() && diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h 
b/lib/Target/AMDGPU/AMDGPUISelLowering.h index 745c9923de2e..69567aa5f713 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -84,6 +84,7 @@ protected: SDValue performCtlzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const; SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const; static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index 513df3a9cdf3..59cba636c586 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -629,9 +629,10 @@ def smax_oneuse : HasOneUseBinOp<smax>; def smin_oneuse : HasOneUseBinOp<smin>; def umax_oneuse : HasOneUseBinOp<umax>; def umin_oneuse : HasOneUseBinOp<umin>; -def sub_oneuse : HasOneUseBinOp<sub>; } // Properties = [SDNPCommutative, SDNPAssociative] +def sub_oneuse : HasOneUseBinOp<sub>; + def select_oneuse : HasOneUseTernaryOp<select>; // Special conversion patterns diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index a1a352642242..e90487065992 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -110,7 +110,7 @@ unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { int AMDGPUTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) { EVT OrigTy = TLI->getValueType(DL, Ty); if (!OrigTy.isSimple()) { diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 1177007644ff..0d83b2a585bf 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -83,7 +83,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); unsigned getCFInstrCost(unsigned Opcode); diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index da9d009c542b..3cf9a1d92469 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -214,7 +214,7 @@ public: } bool isReg() const override { - return isRegKind() && !Reg.Mods.hasModifiers(); + return isRegKind() && !hasModifiers(); } bool isRegOrImmWithInputMods(MVT type) const { @@ -245,6 +245,15 @@ public: return isRegOrImmWithInputMods(MVT::f64); } + bool isVReg() const { + return isRegClass(AMDGPU::VGPR_32RegClassID) || + isRegClass(AMDGPU::VReg_64RegClassID) || + isRegClass(AMDGPU::VReg_96RegClassID) || + isRegClass(AMDGPU::VReg_128RegClassID) || + isRegClass(AMDGPU::VReg_256RegClassID) || + isRegClass(AMDGPU::VReg_512RegClassID); + } + bool isVReg32OrOff() const { return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID); } @@ -299,28 +308,32 @@ public: bool isRegClass(unsigned RCID) const; + bool isRegOrInlineNoMods(unsigned RCID, MVT type) const { + return 
(isRegClass(RCID) || isInlinableImm(type)) && !hasModifiers(); + } + bool isSCSrcB16() const { - return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::i16); + return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i16); } bool isSCSrcB32() const { - return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::i32); + return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i32); } bool isSCSrcB64() const { - return isRegClass(AMDGPU::SReg_64RegClassID) || isInlinableImm(MVT::i64); + return isRegOrInlineNoMods(AMDGPU::SReg_64RegClassID, MVT::i64); } bool isSCSrcF16() const { - return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::f16); + return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f16); } bool isSCSrcF32() const { - return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::f32); + return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f32); } bool isSCSrcF64() const { - return isRegClass(AMDGPU::SReg_64RegClassID) || isInlinableImm(MVT::f64); + return isRegOrInlineNoMods(AMDGPU::SReg_64RegClassID, MVT::f64); } bool isSSrcB32() const { @@ -350,27 +363,27 @@ public: } bool isVCSrcB32() const { - return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::i32); + return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i32); } bool isVCSrcB64() const { - return isRegClass(AMDGPU::VS_64RegClassID) || isInlinableImm(MVT::i64); + return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::i64); } bool isVCSrcB16() const { - return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::i16); + return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i16); } bool isVCSrcF32() const { - return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::f32); + return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f32); } bool isVCSrcF64() const { - return isRegClass(AMDGPU::VS_64RegClassID) || isInlinableImm(MVT::f64); + return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::f64); } bool isVCSrcF16() const { - return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::f16); + return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f16); } bool isVSrcB32() const { @@ -534,6 +547,23 @@ public: addRegOrImmWithInputModsOperands(Inst, N); } + void addRegWithInputModsOperands(MCInst &Inst, unsigned N) const { + Modifiers Mods = getModifiers(); + Inst.addOperand(MCOperand::createImm(Mods.getModifiersOperand())); + assert(isRegKind()); + addRegOperands(Inst, N); + } + + void addRegWithFPInputModsOperands(MCInst &Inst, unsigned N) const { + assert(!hasIntModifiers()); + addRegWithInputModsOperands(Inst, N); + } + + void addRegWithIntInputModsOperands(MCInst &Inst, unsigned N) const { + assert(!hasFPModifiers()); + addRegWithInputModsOperands(Inst, N); + } + void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const { if (isImm()) addImmOperands(Inst, N); @@ -852,9 +882,12 @@ public: StringRef &Value); OperandMatchResultTy parseImm(OperandVector &Operands); + OperandMatchResultTy parseReg(OperandVector &Operands); OperandMatchResultTy parseRegOrImm(OperandVector &Operands); - OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands); - OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands); + OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm = true); + OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm = true); + OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands); + OperandMatchResultTy 
parseRegWithIntInputMods(OperandVector &Operands); OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands); void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands); @@ -1057,7 +1090,7 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const { } bool AMDGPUOperand::isRegClass(unsigned RCID) const { - return isReg() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg()); + return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg()); } void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const { @@ -1468,23 +1501,28 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands) { } OperandMatchResultTy -AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) { - auto res = parseImm(Operands); - if (res != MatchOperand_NoMatch) { - return res; - } - +AMDGPUAsmParser::parseReg(OperandVector &Operands) { if (auto R = parseRegister()) { assert(R->isReg()); R->Reg.IsForcedVOP3 = isForcedVOP3(); Operands.push_back(std::move(R)); return MatchOperand_Success; } - return MatchOperand_ParseFail; + return MatchOperand_NoMatch; } OperandMatchResultTy -AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) { +AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) { + auto res = parseImm(Operands); + if (res != MatchOperand_NoMatch) { + return res; + } + + return parseReg(Operands); +} + +OperandMatchResultTy +AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm) { // XXX: During parsing we can't determine if minus sign means // negate-modifier or negative immediate value. // By default we suppose it is modifier. @@ -1514,7 +1552,12 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) { Abs = true; } - auto Res = parseRegOrImm(Operands); + OperandMatchResultTy Res; + if (AllowImm) { + Res = parseRegOrImm(Operands); + } else { + Res = parseReg(Operands); + } if (Res != MatchOperand_Success) { return Res; } @@ -1548,7 +1591,7 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) { } OperandMatchResultTy -AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) { +AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm) { bool Sext = false; if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "sext") { @@ -1561,7 +1604,12 @@ AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) { Parser.Lex(); } - auto Res = parseRegOrImm(Operands); + OperandMatchResultTy Res; + if (AllowImm) { + Res = parseRegOrImm(Operands); + } else { + Res = parseReg(Operands); + } if (Res != MatchOperand_Success) { return Res; } @@ -1584,6 +1632,16 @@ AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) { return MatchOperand_Success; } +OperandMatchResultTy +AMDGPUAsmParser::parseRegWithFPInputMods(OperandVector &Operands) { + return parseRegOrImmWithFPInputMods(Operands, false); +} + +OperandMatchResultTy +AMDGPUAsmParser::parseRegWithIntInputMods(OperandVector &Operands) { + return parseRegOrImmWithIntInputMods(Operands, false); +} + OperandMatchResultTy AMDGPUAsmParser::parseVReg32OrOff(OperandVector &Operands) { std::unique_ptr<AMDGPUOperand> Reg = parseRegister(); if (Reg) { @@ -3382,7 +3440,7 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) { // Skip it. 
continue; } if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { - Op.addRegOrImmWithFPInputModsOperands(Inst, 2); + Op.addRegWithFPInputModsOperands(Inst, 2); } else if (Op.isDPPCtrl()) { Op.addImmOperands(Inst, 1); } else if (Op.isImm()) { @@ -3508,7 +3566,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, // Skip it. continue; } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { - Op.addRegOrImmWithInputModsOperands(Inst, 2); + Op.addRegWithInputModsOperands(Inst, 2); } else if (Op.isImm()) { // Handle optional arguments OptionalIdx[Op.getImmTy()] = I; diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td index 4112ad100584..48c6592ca5b2 100644 --- a/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/lib/Target/AMDGPU/EvergreenInstructions.td @@ -333,11 +333,13 @@ def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24", def DOT4_eg : DOT4_Common<0xBE>; defm CUBE_eg : CUBE_Common<0xC0>; -def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>; def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>; +def FLT32_TO_FLT16 : R600_1OP_Helper <0xA2, "FLT32_TO_FLT16", fp_to_f16, VecALU>; +def FLT16_TO_FLT32 : R600_1OP_Helper <0xA3, "FLT16_TO_FLT32", f16_to_fp, VecALU>; +def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>; def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>; diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index 831ac5948a68..a5c0d4923d6b 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -25,25 +25,6 @@ using namespace llvm; namespace { -class SIFoldOperands : public MachineFunctionPass { -public: - static char ID; - -public: - SIFoldOperands() : MachineFunctionPass(ID) { - initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - StringRef getPassName() const override { return "SI Fold Operands"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - struct FoldCandidate { MachineInstr *UseMI; union { @@ -79,6 +60,36 @@ struct FoldCandidate { } }; +class SIFoldOperands : public MachineFunctionPass { +public: + static char ID; + MachineRegisterInfo *MRI; + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + + void foldOperand(MachineOperand &OpToFold, + MachineInstr *UseMI, + unsigned UseOpIdx, + SmallVectorImpl<FoldCandidate> &FoldList, + SmallVectorImpl<MachineInstr *> &CopiesToReplace) const; + + void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const; + +public: + SIFoldOperands() : MachineFunctionPass(ID) { + initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "SI Fold Operands"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + } // End anonymous namespace. 
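The fold list and copy list in the foldOperand declaration above are taken as SmallVectorImpl<T>&, the usual LLVM convention for out-parameters: SmallVectorImpl is the common base of every SmallVector<T, N>, so one signature serves callers with any amount of inline storage and small fold lists never need to touch the heap, which the std::vector parameters being replaced in this file could not offer. A minimal sketch of the pattern, using a toy element type rather than FoldCandidate:

#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// The callee only needs "a growable vector of int"; any SmallVector<int, N> fits.
static void collectCandidates(SmallVectorImpl<int> &Out) {
  Out.push_back(42);
}

void demo() {
  SmallVector<int, 4> Few;    // 4 elements of inline storage
  SmallVector<int, 32> Many;  // 32 elements of inline storage
  collectCandidates(Few);
  collectCandidates(Many);    // the same callee serves both callers
}

Read-only parameters get the analogous treatment with ArrayRef, as the isUseMIInFoldList change in a later hunk of this file shows.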
INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE, @@ -88,6 +99,34 @@ char SIFoldOperands::ID = 0; char &llvm::SIFoldOperandsID = SIFoldOperands::ID; +// Wrapper around isInlineConstant that understands special cases when +// instruction types are replaced during operand folding. +static bool isInlineConstantIfFolded(const SIInstrInfo *TII, + const MachineInstr &UseMI, + unsigned OpNo, + const MachineOperand &OpToFold) { + if (TII->isInlineConstant(UseMI, OpNo, OpToFold)) + return true; + + unsigned Opc = UseMI.getOpcode(); + switch (Opc) { + case AMDGPU::V_MAC_F32_e64: + case AMDGPU::V_MAC_F16_e64: { + // Special case for mac. Since this is replaced with mad when folded into + // src2, we need to check the legality for the final instruction. + int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); + if (static_cast<int>(OpNo) == Src2Idx) { + bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; + const MCInstrDesc &MadDesc + = TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); + return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType); + } + } + default: + return false; + } +} + FunctionPass *llvm::createSIFoldOperandsPass() { return new SIFoldOperands(); } @@ -141,7 +180,7 @@ static bool updateOperand(FoldCandidate &Fold, return false; } -static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList, +static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList, const MachineInstr *MI) { for (auto Candidate : FoldList) { if (Candidate.UseMI == MI) @@ -150,7 +189,7 @@ static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList, return false; } -static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, +static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo, MachineOperand *OpToFold, const SIInstrInfo *TII) { @@ -160,7 +199,7 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, unsigned Opc = MI->getOpcode(); if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) && (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) { - bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; + bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; // Check if changing this to a v_mad_{f16, f32} instruction will allow us // to fold the operand. 
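The special case in isInlineConstantIfFolded above exists because folding a constant into the accumulator operand of a mac does not leave the opcode unchanged: the instruction is rewritten as a mad, so the immediate's legality must be judged against the operand description of the opcode the instruction will become, not the one it currently has. A hand-written sketch of just that opcode mapping (assuming the AMDGPU backend headers; not code from the patch):

// Given an instruction's opcode and the operand being folded into, return the
// opcode the instruction will have once the fold is applied.
static unsigned opcodeAfterFolding(unsigned Opc, unsigned OpNo) {
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
    return static_cast<int>(OpNo) == Src2Idx ? AMDGPU::V_MAD_F32 : Opc;
  case AMDGPU::V_MAC_F16_e64:
    return static_cast<int>(OpNo) == Src2Idx ? AMDGPU::V_MAD_F16 : Opc;
  default:
    return Opc; // every other fold keeps the original opcode
  }
}

An inline-constant check can then consult the descriptor of the post-fold opcode, which is exactly what the MadDesc lookup in the hunk above does.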
@@ -227,12 +266,12 @@ static bool isUseSafeToFold(const MachineInstr &MI, //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg()); } -static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, - unsigned UseOpIdx, - std::vector<FoldCandidate> &FoldList, - SmallVectorImpl<MachineInstr *> &CopiesToReplace, - const SIInstrInfo *TII, const SIRegisterInfo &TRI, - MachineRegisterInfo &MRI) { +void SIFoldOperands::foldOperand( + MachineOperand &OpToFold, + MachineInstr *UseMI, + unsigned UseOpIdx, + SmallVectorImpl<FoldCandidate> &FoldList, + SmallVectorImpl<MachineInstr *> &CopiesToReplace) const { const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); if (!isUseSafeToFold(*UseMI, UseOp)) @@ -264,7 +303,7 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); for (MachineRegisterInfo::use_iterator - RSUse = MRI.use_begin(RegSeqDstReg), RSE = MRI.use_end(); + RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end(); RSUse != RSE; ++RSUse) { MachineInstr *RSUseMI = RSUse->getParent(); @@ -272,7 +311,7 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, continue; foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList, - CopiesToReplace, TII, TRI, MRI); + CopiesToReplace); } return; @@ -287,8 +326,8 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, unsigned DestReg = UseMI->getOperand(0).getReg(); const TargetRegisterClass *DestRC = TargetRegisterInfo::isVirtualRegister(DestReg) ? - MRI.getRegClass(DestReg) : - TRI.getPhysRegClass(DestReg); + MRI->getRegClass(DestReg) : + TRI->getPhysRegClass(DestReg); unsigned MovOp = TII->getMovOpcode(DestRC); if (MovOp == AMDGPU::COPY) @@ -318,7 +357,7 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc(); const TargetRegisterClass *FoldRC = - TRI.getRegClass(FoldDesc.OpInfo[0].RegClass); + TRI->getRegClass(FoldDesc.OpInfo[0].RegClass); APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType), OpToFold.getImm()); @@ -328,8 +367,8 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseReg = UseOp.getReg(); const TargetRegisterClass *UseRC = TargetRegisterInfo::isVirtualRegister(UseReg) ? - MRI.getRegClass(UseReg) : - TRI.getPhysRegClass(UseReg); + MRI->getRegClass(UseReg) : + TRI->getPhysRegClass(UseReg); assert(Imm.getBitWidth() == 64); @@ -349,20 +388,51 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, } static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result, - int32_t LHS, int32_t RHS) { + uint32_t LHS, uint32_t RHS) { switch (Opcode) { case AMDGPU::V_AND_B32_e64: + case AMDGPU::V_AND_B32_e32: case AMDGPU::S_AND_B32: Result = LHS & RHS; return true; case AMDGPU::V_OR_B32_e64: + case AMDGPU::V_OR_B32_e32: case AMDGPU::S_OR_B32: Result = LHS | RHS; return true; case AMDGPU::V_XOR_B32_e64: + case AMDGPU::V_XOR_B32_e32: case AMDGPU::S_XOR_B32: Result = LHS ^ RHS; return true; + case AMDGPU::V_LSHL_B32_e64: + case AMDGPU::V_LSHL_B32_e32: + case AMDGPU::S_LSHL_B32: + // The instruction ignores the high bits for out of bounds shifts. 
+ Result = LHS << (RHS & 31); + return true; + case AMDGPU::V_LSHLREV_B32_e64: + case AMDGPU::V_LSHLREV_B32_e32: + Result = RHS << (LHS & 31); + return true; + case AMDGPU::V_LSHR_B32_e64: + case AMDGPU::V_LSHR_B32_e32: + case AMDGPU::S_LSHR_B32: + Result = LHS >> (RHS & 31); + return true; + case AMDGPU::V_LSHRREV_B32_e64: + case AMDGPU::V_LSHRREV_B32_e32: + Result = RHS >> (LHS & 31); + return true; + case AMDGPU::V_ASHR_I32_e64: + case AMDGPU::V_ASHR_I32_e32: + case AMDGPU::S_ASHR_I32: + Result = static_cast<int32_t>(LHS) >> (RHS & 31); + return true; + case AMDGPU::V_ASHRREV_I32_e64: + case AMDGPU::V_ASHRREV_I32_e32: + Result = static_cast<int32_t>(RHS) >> (LHS & 31); + return true; default: return false; } @@ -390,33 +460,47 @@ static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) { stripExtraCopyOperands(MI); } +static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI, + MachineOperand &Op) { + if (Op.isReg()) { + // If this has a subregister, it obviously is a register source. + if (Op.getSubReg() != AMDGPU::NoSubRegister) + return &Op; + + MachineInstr *Def = MRI.getVRegDef(Op.getReg()); + if (Def->isMoveImmediate()) { + MachineOperand &ImmSrc = Def->getOperand(1); + if (ImmSrc.isImm()) + return &ImmSrc; + } + } + + return &Op; +} + // Try to simplify operations with a constant that may appear after instruction // selection. +// TODO: See if a frame index with a fixed offset can fold. static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII, - MachineInstr *MI) { + MachineInstr *MI, + MachineOperand *ImmOp) { unsigned Opc = MI->getOpcode(); - if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 || Opc == AMDGPU::S_NOT_B32) { - MachineOperand &Src0 = MI->getOperand(1); - if (Src0.isImm()) { - Src0.setImm(~Src0.getImm()); - mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32))); - return true; - } - - return false; + MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm()); + mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32))); + return true; } - if (!MI->isCommutable()) + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + if (Src1Idx == -1) return false; int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx)); + MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx)); - MachineOperand *Src0 = &MI->getOperand(Src0Idx); - MachineOperand *Src1 = &MI->getOperand(Src1Idx); if (!Src0->isImm() && !Src1->isImm()) return false; @@ -431,19 +515,26 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI = TII->getRegisterInfo(); bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg()); - Src0->setImm(NewImm); + // Be careful to change the right operand, src0 may belong to a different + // instruction. 
+ MI->getOperand(Src0Idx).ChangeToImmediate(NewImm); MI->RemoveOperand(Src1Idx); mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR))); return true; } + if (!MI->isCommutable()) + return false; + if (Src0->isImm() && !Src1->isImm()) { std::swap(Src0, Src1); std::swap(Src0Idx, Src1Idx); } int32_t Src1Val = static_cast<int32_t>(Src1->getImm()); - if (Opc == AMDGPU::V_OR_B32_e64 || Opc == AMDGPU::S_OR_B32) { + if (Opc == AMDGPU::V_OR_B32_e64 || + Opc == AMDGPU::V_OR_B32_e32 || + Opc == AMDGPU::S_OR_B32) { if (Src1Val == 0) { // y = or x, 0 => y = copy x MI->RemoveOperand(Src1Idx); @@ -459,6 +550,7 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, } if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 || + MI->getOpcode() == AMDGPU::V_AND_B32_e32 || MI->getOpcode() == AMDGPU::S_AND_B32) { if (Src1Val == 0) { // y = and x, 0 => y = v_mov_b32 0 @@ -476,29 +568,136 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, } if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 || + MI->getOpcode() == AMDGPU::V_XOR_B32_e32 || MI->getOpcode() == AMDGPU::S_XOR_B32) { if (Src1Val == 0) { // y = xor x, 0 => y = copy x MI->RemoveOperand(Src1Idx); mutateCopyOp(*MI, TII->get(AMDGPU::COPY)); + return true; } } return false; } +void SIFoldOperands::foldInstOperand(MachineInstr &MI, + MachineOperand &OpToFold) const { + // We need mutate the operands of new mov instructions to add implicit + // uses of EXEC, but adding them invalidates the use_iterator, so defer + // this. + SmallVector<MachineInstr *, 4> CopiesToReplace; + SmallVector<FoldCandidate, 4> FoldList; + MachineOperand &Dst = MI.getOperand(0); + + bool FoldingImm = OpToFold.isImm() || OpToFold.isFI(); + if (FoldingImm) { + unsigned NumLiteralUses = 0; + MachineOperand *NonInlineUse = nullptr; + int NonInlineUseOpNo = -1; + + MachineRegisterInfo::use_iterator NextUse, NextInstUse; + for (MachineRegisterInfo::use_iterator + Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end(); + Use != E; Use = NextUse) { + NextUse = std::next(Use); + MachineInstr *UseMI = Use->getParent(); + unsigned OpNo = Use.getOperandNo(); + + // Folding the immediate may reveal operations that can be constant + // folded or replaced with a copy. This can happen for example after + // frame indices are lowered to constants or from splitting 64-bit + // constants. + // + // We may also encounter cases where one or both operands are + // immediates materialized into a register, which would ordinarily not + // be folded due to multiple uses or operand constraints. + + if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) { + DEBUG(dbgs() << "Constant folded " << *UseMI <<'\n'); + + // Some constant folding cases change the same immediate's use to a new + // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user + // again. The same constant folded instruction could also have a second + // use operand. + NextUse = MRI->use_begin(Dst.getReg()); + continue; + } + + // Try to fold any inline immediate uses, and then only fold other + // constants if they have one use. + // + // The legality of the inline immediate must be checked based on the use + // operand, not the defining instruction, because 32-bit instructions + // with 32-bit inline immediate sources may be used to materialize + // constants used in 16-bit operands. + // + // e.g. it is unsafe to fold: + // s_mov_b32 s0, 1.0 // materializes 0x3f800000 + // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00 + + // Folding immediates with more than one use will increase program size. 
+ // FIXME: This will also reduce register usage, which may be better + // in some cases. A better heuristic is needed. + if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) { + foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); + } else { + if (++NumLiteralUses == 1) { + NonInlineUse = &*Use; + NonInlineUseOpNo = OpNo; + } + } + } + + if (NumLiteralUses == 1) { + MachineInstr *UseMI = NonInlineUse->getParent(); + foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace); + } + } else { + // Folding register. + for (MachineRegisterInfo::use_iterator + Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end(); + Use != E; ++Use) { + MachineInstr *UseMI = Use->getParent(); + + foldOperand(OpToFold, UseMI, Use.getOperandNo(), + FoldList, CopiesToReplace); + } + } + + MachineFunction *MF = MI.getParent()->getParent(); + // Make sure we add EXEC uses to any new v_mov instructions created. + for (MachineInstr *Copy : CopiesToReplace) + Copy->addImplicitDefUseOperands(*MF); + + for (FoldCandidate &Fold : FoldList) { + if (updateOperand(Fold, *TRI)) { + // Clear kill flags. + if (Fold.isReg()) { + assert(Fold.OpToFold && Fold.OpToFold->isReg()); + // FIXME: Probably shouldn't bother trying to fold if not an + // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR + // copies. + MRI->clearKillFlags(Fold.OpToFold->getReg()); + } + DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << + static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n'); + } + } +} + bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction())) return false; const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIInstrInfo *TII = ST.getInstrInfo(); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); + MRI = &MF.getRegInfo(); + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { + BI != BE; ++BI) { MachineBasicBlock &MBB = *BI; MachineBasicBlock::iterator I, Next; @@ -512,8 +711,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { MachineOperand &OpToFold = MI.getOperand(1); bool FoldingImm = OpToFold.isImm() || OpToFold.isFI(); - // FIXME: We could also be folding things like FrameIndexes and - // TargetIndexes. + // FIXME: We could also be folding things like TargetIndexes. if (!FoldingImm && !OpToFold.isReg()) continue; @@ -532,90 +730,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) continue; - // We need mutate the operands of new mov instructions to add implicit - // uses of EXEC, but adding them invalidates the use_iterator, so defer - // this. - SmallVector<MachineInstr *, 4> CopiesToReplace; - - std::vector<FoldCandidate> FoldList; - if (FoldingImm) { - unsigned NumLiteralUses = 0; - MachineOperand *NonInlineUse = nullptr; - int NonInlineUseOpNo = -1; - - // Try to fold any inline immediate uses, and then only fold other - // constants if they have one use. - // - // The legality of the inline immediate must be checked based on the use - // operand, not the defining instruction, because 32-bit instructions - // with 32-bit inline immediate sources may be used to materialize - // constants used in 16-bit operands. - // - // e.g. 
it is unsafe to fold: - // s_mov_b32 s0, 1.0 // materializes 0x3f800000 - // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00 - - // Folding immediates with more than one use will increase program size. - // FIXME: This will also reduce register usage, which may be better - // in some cases. A better heuristic is needed. - for (MachineRegisterInfo::use_iterator - Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end(); - Use != E; ++Use) { - MachineInstr *UseMI = Use->getParent(); - unsigned OpNo = Use.getOperandNo(); - - if (TII->isInlineConstant(*UseMI, OpNo, OpToFold)) { - foldOperand(OpToFold, UseMI, OpNo, FoldList, - CopiesToReplace, TII, TRI, MRI); - } else { - if (++NumLiteralUses == 1) { - NonInlineUse = &*Use; - NonInlineUseOpNo = OpNo; - } - } - } - - if (NumLiteralUses == 1) { - MachineInstr *UseMI = NonInlineUse->getParent(); - foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, - CopiesToReplace, TII, TRI, MRI); - } - } else { - // Folding register. - for (MachineRegisterInfo::use_iterator - Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end(); - Use != E; ++Use) { - MachineInstr *UseMI = Use->getParent(); - - foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList, - CopiesToReplace, TII, TRI, MRI); - } - } - - // Make sure we add EXEC uses to any new v_mov instructions created. - for (MachineInstr *Copy : CopiesToReplace) - Copy->addImplicitDefUseOperands(MF); - - for (FoldCandidate &Fold : FoldList) { - if (updateOperand(Fold, TRI)) { - // Clear kill flags. - if (Fold.isReg()) { - assert(Fold.OpToFold && Fold.OpToFold->isReg()); - // FIXME: Probably shouldn't bother trying to fold if not an - // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR - // copies. - MRI.clearKillFlags(Fold.OpToFold->getReg()); - } - DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << - static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n'); - - // Folding the immediate may reveal operations that can be constant - // folded or replaced with a copy. This can happen for example after - // frame indices are lowered to constants or from splitting 64-bit - // constants. 
- tryConstantFoldOp(MRI, TII, Fold.UseMI); - } - } + foldInstOperand(MI, OpToFold); } } return false; diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 34096e158039..ebaefae3bfef 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -557,6 +557,27 @@ class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass> def Int32InputMods : IntInputMods<Int32InputModsMatchClass>; def Int64InputMods : IntInputMods<Int64InputModsMatchClass>; +def FPVRegInputModsMatchClass : AsmOperandClass { + let Name = "VRegWithFPInputMods"; + let ParserMethod = "parseRegWithFPInputMods"; + let PredicateMethod = "isVReg"; +} + +def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> { + let PrintMethod = "printOperandAndFPInputMods"; +} + +def IntVRegInputModsMatchClass : AsmOperandClass { + let Name = "VRegWithIntInputMods"; + let ParserMethod = "parseRegWithIntInputMods"; + let PredicateMethod = "isVReg"; +} + +def IntVRegInputMods : InputMods <IntVRegInputModsMatchClass> { + let PrintMethod = "printOperandAndIntInputMods"; +} + + //===----------------------------------------------------------------------===// // Complex patterns //===----------------------------------------------------------------------===// @@ -761,6 +782,15 @@ class getSrcMod <ValueType VT> { ); } +// Return type of input modifiers operand specified input operand for SDWA/DPP +class getSrcModExt <ValueType VT> { + bit isFP = !if(!eq(VT.Value, f16.Value), 1, + !if(!eq(VT.Value, f32.Value), 1, + !if(!eq(VT.Value, f64.Value), 1, + 0))); + Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods); +} + // Returns the input arguments for VOP[12C] instructions for the given SrcVT. class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> { dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1 @@ -1001,6 +1031,11 @@ class VOPProfile <list<ValueType> _ArgVT> { field Operand Src0Mod = getSrcMod<Src0VT>.ret; field Operand Src1Mod = getSrcMod<Src1VT>.ret; field Operand Src2Mod = getSrcMod<Src2VT>.ret; + field Operand Src0ModDPP = getSrcModExt<Src0VT>.ret; + field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret; + field Operand Src0ModSDWA = getSrcModExt<Src0VT>.ret; + field Operand Src1ModSDWA = getSrcModExt<Src1VT>.ret; + field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); field bit HasDst32 = HasDst; @@ -1038,15 +1073,16 @@ class VOPProfile <list<ValueType> _ArgVT> { field dag Outs32 = Outs; field dag Outs64 = Outs; field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret; - field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCDPP>.ret; + field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCSDWA>.ret; field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret; field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret; field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs, - HasModifiers, Src0Mod, Src1Mod>.ret; + HasModifiers, Src0ModDPP, Src1ModDPP>.ret; field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs, - HasModifiers, Src0Mod, Src1Mod, DstVT>.ret; + HasModifiers, Src0ModSDWA, Src1ModSDWA, + DstVT>.ret; field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret; field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index bc35c2edc8d3..b86c04191189 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td 
@@ -871,6 +871,11 @@ def : Pat < >; def : Pat < + (i16 (sext_inreg i16:$src, i1)), + (S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16 +>; + +def : Pat < (i16 (sext_inreg i16:$src, i8)), (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16 >; diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp index b27d7c691032..dd31dc690840 100644 --- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -84,12 +84,17 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add // a special case for it. It can only be shrunk if the third operand // is vcc. We should handle this the same way we handle vopc, by addding - // a register allocation hint pre-regalloc and then do the shrining + // a register allocation hint pre-regalloc and then do the shrinking // post-regalloc. if (Src2) { switch (MI.getOpcode()) { default: return false; + case AMDGPU::V_ADDC_U32_e64: + case AMDGPU::V_SUBB_U32_e64: + // Additional verification is needed for sdst/src2. + return true; + case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_MAC_F16_e64: if (!isVGPR(Src2, TRI, MRI) || @@ -174,7 +179,7 @@ static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig) { for (MachineOperand &Use : MI.implicit_operands()) { - if (Use.getReg() == AMDGPU::VCC) { + if (Use.isUse() && Use.getReg() == AMDGPU::VCC) { Use.setIsUndef(Orig.isUndef()); Use.setIsKill(Orig.isKill()); return; @@ -456,6 +461,31 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { continue; } + // Check for the bool flag output for instructions like V_ADD_I32_e64. + const MachineOperand *SDst = TII->getNamedOperand(MI, + AMDGPU::OpName::sdst); + + // Check the carry-in operand for v_addc_u32_e64. + const MachineOperand *Src2 = TII->getNamedOperand(MI, + AMDGPU::OpName::src2); + + if (SDst) { + if (SDst->getReg() != AMDGPU::VCC) { + if (TargetRegisterInfo::isVirtualRegister(SDst->getReg())) + MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC); + continue; + } + + // All of the instructions with carry outs also have an SGPR input in + // src2. 
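The S_BFE_I32 patterns added above pack the bitfield descriptor as offset | (width << 16), which is why a 1-bit sign extension from bit 0 uses 0x00010000 and the existing 8-bit case uses 0x80000. A small self-checking sketch of that encoding; the sBFEImm helper is illustrative, not an LLVM function.

#include <cstdint>

// Assumed operand layout, taken from the comments in the patterns above:
// low bits hold the offset, bits [22:16] hold the field width.
constexpr uint32_t sBFEImm(uint32_t Offset, uint32_t Width) {
  return Offset | (Width << 16);
}

static_assert(sBFEImm(0, 1) == 0x00010000, "i1 sign-extend descriptor");
static_assert(sBFEImm(0, 8) == 0x00080000, "i8 sign-extend descriptor");
static_assert(sBFEImm(0, 16) == 0x00100000, "i16 field descriptor");

int main() { return 0; }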
+ if (Src2 && Src2->getReg() != AMDGPU::VCC) { + if (TargetRegisterInfo::isVirtualRegister(Src2->getReg())) + MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC); + + continue; + } + } + // We can shrink this instruction DEBUG(dbgs() << "Shrinking " << MI); @@ -481,8 +511,6 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (Src1) Inst32.addOperand(*Src1); - const MachineOperand *Src2 = - TII->getNamedOperand(MI, AMDGPU::OpName::src2); if (Src2) { int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); if (Op32Src2Idx != -1) { diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td index bff706cdc1dc..a15b9ceff2f4 100644 --- a/lib/Target/AMDGPU/VOP1Instructions.td +++ b/lib/Target/AMDGPU/VOP1Instructions.td @@ -232,7 +232,7 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> { let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0); let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); - let InsSDWA = (ins Src0RC32:$vdst, Int32InputMods:$src0_modifiers, VCSrc_b32:$src0, + let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, VCSrc_b32:$src0, clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel); diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index 20fb7f7bcab7..00e5ab3db0b7 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -183,13 +183,13 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3, HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret; - let InsDPP = (ins FP32InputMods:$src0_modifiers, Src0DPP:$src0, - FP32InputMods:$src1_modifiers, Src1DPP:$src1, + let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, + Src1ModDPP:$src1_modifiers, Src1DPP:$src1, VGPR_32:$src2, // stub argument dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); - let InsSDWA = (ins FP32InputMods:$src0_modifiers, Src0SDWA:$src0, - FP32InputMods:$src1_modifiers, Src1SDWA:$src1, + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, + Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, VGPR_32:$src2, // stub argument clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td index c431d9db801e..16a456da3c67 100644 --- a/lib/Target/AMDGPU/VOPCInstructions.td +++ b/lib/Target/AMDGPU/VOPCInstructions.td @@ -517,8 +517,8 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> : VOPC_Profile<sched, vt, i32> { let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); let Asm64 = "$sdst, $src0_modifiers, $src1"; - let InsSDWA = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, - Int32InputMods:$src1_modifiers, Src1RC64:$src1, + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, + Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel); let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel"; let HasSrc1Mods = 0; diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index afba1587a743..32b7c87e61bb 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ 
b/lib/Target/ARM/ARMISelLowering.cpp @@ -608,15 +608,27 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have // a __gnu_ prefix (which is the default). if (Subtarget->isTargetAEABI()) { - setLibcallName(RTLIB::FPROUND_F32_F16, "__aeabi_f2h"); - setLibcallName(RTLIB::FPROUND_F64_F16, "__aeabi_d2h"); - setLibcallName(RTLIB::FPEXT_F16_F32, "__aeabi_h2f"); + static const struct { + const RTLIB::Libcall Op; + const char * const Name; + const CallingConv::ID CC; + } LibraryCalls[] = { + { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS }, + { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS }, + { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS }, + }; + + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + setLibcallCallingConv(LC.Op, LC.CC); + } } if (Subtarget->isThumb1Only()) addRegisterClass(MVT::i32, &ARM::tGPRRegClass); else addRegisterClass(MVT::i32, &ARM::GPRRegClass); + if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { addRegisterClass(MVT::f32, &ARM::SPRRegClass); @@ -976,6 +988,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); + // Register based DivRem for AEABI (RTABI 4.2) if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || @@ -984,29 +997,49 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::UREM, MVT::i64, Custom); HasStandaloneRem = false; - for (const auto &LC : - {RTLIB::SDIVREM_I8, RTLIB::SDIVREM_I16, RTLIB::SDIVREM_I32}) - setLibcallName(LC, Subtarget->isTargetWindows() ? "__rt_sdiv" - : "__aeabi_idivmod"); - setLibcallName(RTLIB::SDIVREM_I64, Subtarget->isTargetWindows() - ? "__rt_sdiv64" - : "__aeabi_ldivmod"); - for (const auto &LC : - {RTLIB::UDIVREM_I8, RTLIB::UDIVREM_I16, RTLIB::UDIVREM_I32}) - setLibcallName(LC, Subtarget->isTargetWindows() ? "__rt_udiv" - : "__aeabi_uidivmod"); - setLibcallName(RTLIB::UDIVREM_I64, Subtarget->isTargetWindows() - ? 
"__rt_udiv64" - : "__aeabi_uldivmod"); - - setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::SDIVREM_I32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::SDIVREM_I64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UDIVREM_I32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UDIVREM_I64, CallingConv::ARM_AAPCS); + if (Subtarget->isTargetWindows()) { + const struct { + const RTLIB::Libcall Op; + const char * const Name; + const CallingConv::ID CC; + } LibraryCalls[] = { + { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS }, + { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS }, + { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS }, + { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS }, + + { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS }, + { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS }, + { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS }, + { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS }, + }; + + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + setLibcallCallingConv(LC.Op, LC.CC); + } + } else { + const struct { + const RTLIB::Libcall Op; + const char * const Name; + const CallingConv::ID CC; + } LibraryCalls[] = { + { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, + { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, + { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, + { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS }, + + { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, + { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, + { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, + { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS }, + }; + + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + setLibcallCallingConv(LC.Op, LC.CC); + } + } setOperationAction(ISD::SDIVREM, MVT::i32, Custom); setOperationAction(ISD::UDIVREM, MVT::i32, Custom); @@ -3305,11 +3338,6 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, SDLoc dl(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. - case Intrinsic::arm_rbit: { - assert(Op.getOperand(1).getValueType() == MVT::i32 && - "RBIT intrinsic must have i32 type!"); - return DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Op.getOperand(1)); - } case Intrinsic::thread_pointer: { EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); @@ -9232,12 +9260,102 @@ SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, return SDValue(); } -// AddCombineToVPADDL- For pair-wise add on neon, use the vpaddl instruction -// (only after legalization). -static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1, +static bool IsVUZPShuffleNode(SDNode *N) { + // VUZP shuffle node. + if (N->getOpcode() == ARMISD::VUZP) + return true; + + // "VUZP" on i32 is an alias for VTRN. 
+ if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32) + return true; + + return false; +} + +static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { + // Look for ADD(VUZP.0, VUZP.1). + if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() || + N0 == N1) + return SDValue(); + + // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD. + if (!N->getValueType(0).is64BitVector()) + return SDValue(); + // Generate vpadd. + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDLoc dl(N); + SDNode *Unzip = N0.getNode(); + EVT VT = N->getValueType(0); + + SmallVector<SDValue, 8> Ops; + Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl, + TLI.getPointerTy(DAG.getDataLayout()))); + Ops.push_back(Unzip->getOperand(0)); + Ops.push_back(Unzip->getOperand(1)); + + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); +} + +static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + // Check for two extended operands. + if (!(N0.getOpcode() == ISD::SIGN_EXTEND && + N1.getOpcode() == ISD::SIGN_EXTEND) && + !(N0.getOpcode() == ISD::ZERO_EXTEND && + N1.getOpcode() == ISD::ZERO_EXTEND)) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + SDValue N10 = N1.getOperand(0); + + // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1)) + if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() || + N00 == N10) + return SDValue(); + + // We only recognize Q register paddl here; this can't be reached until + // after type legalization. + if (!N00.getValueType().is64BitVector() || + !N0.getValueType().is128BitVector()) + return SDValue(); + + // Generate vpaddl. + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDLoc dl(N); + EVT VT = N->getValueType(0); + + SmallVector<SDValue, 8> Ops; + // Form vpaddl.sN or vpaddl.uN depending on the kind of extension. + unsigned Opcode; + if (N0.getOpcode() == ISD::SIGN_EXTEND) + Opcode = Intrinsic::arm_neon_vpaddls; + else + Opcode = Intrinsic::arm_neon_vpaddlu; + Ops.push_back(DAG.getConstant(Opcode, dl, + TLI.getPointerTy(DAG.getDataLayout()))); + EVT ElemTy = N00.getValueType().getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2); + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT, + N00.getOperand(0), N00.getOperand(1)); + Ops.push_back(Concat); + + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); +} + +// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in +// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is +// much easier to match. +static SDValue +AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { // Only perform optimization if after legalize, and if NEON is available. We // also expected both operands to be BUILD_VECTORs. if (DCI.isBeforeLegalize() || !Subtarget->hasNEON() @@ -9293,6 +9411,10 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1, return SDValue(); } + // Don't generate vpaddl+vmovn; we'll match it to vpadd later. + if (Vec.getValueType().getVectorElementType() == VT.getVectorElementType()) + return SDValue(); + // Create VPADDL node. 
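The two combines above rewrite ADD(VUZP.0, VUZP.1) into vpadd and ADD({s,z}ext(VUZP.0), {s,z}ext(VUZP.1)) into vpaddl. As a reference for what those NEON operations compute, here is a scalar model of pairwise add (vpadd) and widening pairwise add (vpaddl.s16); the element width and function names are chosen for illustration only.

#include <array>
#include <cassert>
#include <cstdint>

// vpadd: adds adjacent pairs from the concatenation of the two inputs.
std::array<int16_t, 4> vpadd(const std::array<int16_t, 4> &A,
                             const std::array<int16_t, 4> &B) {
  return {int16_t(A[0] + A[1]), int16_t(A[2] + A[3]),
          int16_t(B[0] + B[1]), int16_t(B[2] + B[3])};
}

// vpaddl.s16: adds adjacent pairs of a single input, widening each sum.
std::array<int32_t, 2> vpaddl(const std::array<int16_t, 4> &A) {
  return {int32_t(A[0]) + A[1], int32_t(A[2]) + A[3]};
}

int main() {
  std::array<int16_t, 4> A = {1, 2, 3, 4}, B = {5, 6, 7, 8};
  assert((vpadd(A, B) == std::array<int16_t, 4>{3, 7, 11, 15}));
  assert((vpaddl(A) == std::array<int32_t, 2>{3, 7}));
  return 0;
}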
SelectionDAG &DAG = DCI.DAG; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -9564,9 +9686,15 @@ static SDValue PerformADDCCombine(SDNode *N, static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget){ + // Attempt to create vpadd for this add. + if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget)) + return Result; // Attempt to create vpaddl for this add. - if (SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget)) + if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget)) + return Result; + if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI, + Subtarget)) return Result; // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 5255d82d647a..7a7f91f4d3c4 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -16,16 +16,28 @@ #define LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H #include "MCTargetDesc/ARMBaseInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Target/TargetLowering.h" -#include <vector> +#include <utility> namespace llvm { - class ARMConstantPoolValue; - class ARMSubtarget; + +class ARMSubtarget; +class InstrItineraryData; namespace ARMISD { + // ARM Specific DAG Nodes enum NodeType : unsigned { // Start the numbering where the builtin ops and target ops leave off. @@ -217,12 +229,15 @@ namespace llvm { VST3LN_UPD, VST4LN_UPD }; - } + + } // end namespace ARMISD /// Define some predicates that are used for node matching. namespace ARM { + bool isBitFieldInvertedMask(unsigned v); - } + + } // end namespace ARM //===--------------------------------------------------------------------===// // ARMTargetLowering - ARM Implementation of the TargetLowering interface @@ -531,6 +546,7 @@ namespace llvm { std::pair<SDValue, SDValue> getARMXALUOOp(SDValue Op, SelectionDAG &DAG, SDValue &ARMcc) const; typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector; + void PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, RegsToPassVector &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, @@ -623,6 +639,7 @@ namespace llvm { return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); } + void initializeSplitCSR(MachineBasicBlock *Entry) const override; void insertCopiesSplitCSR( MachineBasicBlock *Entry, @@ -644,9 +661,8 @@ namespace llvm { unsigned ArgOffset, unsigned TotalArgRegsSaveSize, bool ForceMutable = false) const; - SDValue - LowerCall(TargetLowering::CallLoweringInfo &CLI, - SmallVectorImpl<SDValue> &InVals) const override; + SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const override; /// HandleByVal - Target-specific cleanup for ByVal support. 
void HandleByVal(CCState *, unsigned &, unsigned) const override; @@ -712,9 +728,12 @@ namespace llvm { }; namespace ARM { + FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo); - } -} -#endif // ARMISELLOWERING_H + } // end namespace ARM + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp index 9bd036a1eace..324087d670b5 100644 --- a/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -29,7 +29,33 @@ using namespace llvm; // into an ARMGenRegisterBankInfo.def (similar to AArch64). namespace llvm { namespace ARM { -RegisterBank GPRRegBank; +const uint32_t GPRCoverageData[] = { + // Classes 0-31 + (1u << ARM::GPRRegClassID) | (1u << ARM::GPRwithAPSRRegClassID) | + (1u << ARM::GPRnopcRegClassID) | (1u << ARM::rGPRRegClassID) | + (1u << ARM::hGPRRegClassID) | (1u << ARM::tGPRRegClassID) | + (1u << ARM::GPRnopc_and_hGPRRegClassID) | + (1u << ARM::hGPR_and_rGPRRegClassID) | (1u << ARM::tcGPRRegClassID) | + (1u << ARM::tGPR_and_tcGPRRegClassID) | (1u << ARM::GPRspRegClassID) | + (1u << ARM::hGPR_and_tcGPRRegClassID), + // Classes 32-63 + 0, + // Classes 64-96 + 0, + // FIXME: Some of the entries below this point can be safely removed once + // this is tablegenerated. It's only needed because of the hardcoded + // register class limit. + // Classes 97-128 + 0, + // Classes 129-160 + 0, + // Classes 161-192 + 0, + // Classes 193-224 + 0, +}; + +RegisterBank GPRRegBank(ARM::GPRRegBankID, "GPRB", 32, ARM::GPRCoverageData); RegisterBank *RegBanks[] = {&GPRRegBank}; RegisterBankInfo::PartialMapping GPRPartialMapping{0, 32, GPRRegBank}; @@ -51,14 +77,11 @@ ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI) return; AlreadyInit = true; - // Initialize the GPR bank. - createRegisterBank(ARM::GPRRegBankID, "GPRB"); - - addRegBankCoverage(ARM::GPRRegBankID, ARM::GPRRegClassID, TRI); - addRegBankCoverage(ARM::GPRRegBankID, ARM::GPRwithAPSRRegClassID, TRI); const RegisterBank &RBGPR = getRegBank(ARM::GPRRegBankID); (void)RBGPR; assert(&ARM::GPRRegBank == &RBGPR && "The order in RegBanks is messed up"); + + // Initialize the GPR bank. 
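The GPRCoverageData table above packs one bit per register class, 32 classes per 32-bit word, to record which classes the GPR bank covers; the trailing zero words exist only because of the hard-coded class limit called out in the FIXME. A minimal sketch of how such a coverage mask is populated and queried, with illustrative names and class IDs.

#include <cassert>
#include <cstdint>

constexpr unsigned NumWords = 7;  // enough words for the assumed class limit

void setCovered(uint32_t (&Mask)[NumWords], unsigned ClassID) {
  Mask[ClassID / 32] |= 1u << (ClassID % 32);
}

bool covers(const uint32_t (&Mask)[NumWords], unsigned ClassID) {
  return (Mask[ClassID / 32] >> (ClassID % 32)) & 1u;
}

int main() {
  uint32_t Mask[NumWords] = {};
  setCovered(Mask, 3);   // a hypothetical class in the first word
  setCovered(Mask, 40);  // a hypothetical class in the second word
  assert(covers(Mask, 3) && covers(Mask, 40) && !covers(Mask, 41));
  return 0;
}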
assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRRegClassID)) && "Subclass not added?"); assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRwithAPSRRegClassID)) && diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index cc001b596785..2b6b36bc3e68 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -433,7 +433,8 @@ int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, int ARMTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef<const Value *> Args) { int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h index 731a5adf3d73..3c83cd92a61a 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/lib/Target/ARM/ARMTargetTransformInfo.h @@ -114,7 +114,8 @@ public: TTI::OperandValueKind Op1Info = TTI::OK_AnyValue, TTI::OperandValueKind Op2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace); diff --git a/lib/Target/Lanai/LanaiTargetTransformInfo.h b/lib/Target/Lanai/LanaiTargetTransformInfo.h index 7fcb3ce45bbb..d95c16fc3caf 100644 --- a/lib/Target/Lanai/LanaiTargetTransformInfo.h +++ b/lib/Target/Lanai/LanaiTargetTransformInfo.h @@ -54,7 +54,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None) { + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()) { int ISD = TLI->InstructionOpcodeToISD(Opcode); switch (ISD) { diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index 26e0f9a94368..f28e8b36fdbc 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -14,11 +14,13 @@ #include "MipsMachineFunction.h" #include "MipsRegisterInfo.h" #include "MipsTargetMachine.h" +#include "llvm/ADT/APInt.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" @@ -1456,9 +1458,12 @@ static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) { return Result; } -static SDValue lowerMSASplatImm(SDValue Op, unsigned ImmOp, SelectionDAG &DAG) { - return DAG.getConstant(Op->getConstantOperandVal(ImmOp), SDLoc(Op), - Op->getValueType(0)); +static SDValue lowerMSASplatImm(SDValue Op, unsigned ImmOp, SelectionDAG &DAG, + bool IsSigned = false) { + return DAG.getConstant( + APInt(Op->getValueType(0).getScalarType().getSizeInBits(), + Op->getConstantOperandVal(ImmOp), IsSigned), + SDLoc(Op), Op->getValueType(0)); } static SDValue getBuildVectorSplat(EVT VecTy, 
SDValue SplatValue, @@ -1564,8 +1569,8 @@ static SDValue lowerMSABitClearImm(SDValue Op, SelectionDAG &DAG) { SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); - - switch (cast<ConstantSDNode>(Op->getOperand(0))->getZExtValue()) { + unsigned Intrinsic = cast<ConstantSDNode>(Op->getOperand(0))->getZExtValue(); + switch (Intrinsic) { default: return SDValue(); case Intrinsic::mips_shilo: @@ -1635,6 +1640,8 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, // binsli_x(IfClear, IfSet, nbits) -> (vselect LBitsMask, IfSet, IfClear) EVT VecTy = Op->getValueType(0); EVT EltTy = VecTy.getVectorElementType(); + if (Op->getConstantOperandVal(3) >= EltTy.getSizeInBits()) + report_fatal_error("Immediate out of range"); APInt Mask = APInt::getHighBitsSet(EltTy.getSizeInBits(), Op->getConstantOperandVal(3)); return DAG.getNode(ISD::VSELECT, DL, VecTy, @@ -1648,6 +1655,8 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, // binsri_x(IfClear, IfSet, nbits) -> (vselect RBitsMask, IfSet, IfClear) EVT VecTy = Op->getValueType(0); EVT EltTy = VecTy.getVectorElementType(); + if (Op->getConstantOperandVal(3) >= EltTy.getSizeInBits()) + report_fatal_error("Immediate out of range"); APInt Mask = APInt::getLowBitsSet(EltTy.getSizeInBits(), Op->getConstantOperandVal(3)); return DAG.getNode(ISD::VSELECT, DL, VecTy, @@ -1741,7 +1750,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_ceqi_w: case Intrinsic::mips_ceqi_d: return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1), - lowerMSASplatImm(Op, 2, DAG), ISD::SETEQ); + lowerMSASplatImm(Op, 2, DAG, true), ISD::SETEQ); case Intrinsic::mips_cle_s_b: case Intrinsic::mips_cle_s_h: case Intrinsic::mips_cle_s_w: @@ -1753,7 +1762,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_clei_s_w: case Intrinsic::mips_clei_s_d: return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1), - lowerMSASplatImm(Op, 2, DAG), ISD::SETLE); + lowerMSASplatImm(Op, 2, DAG, true), ISD::SETLE); case Intrinsic::mips_cle_u_b: case Intrinsic::mips_cle_u_h: case Intrinsic::mips_cle_u_w: @@ -1777,7 +1786,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_clti_s_w: case Intrinsic::mips_clti_s_d: return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1), - lowerMSASplatImm(Op, 2, DAG), ISD::SETLT); + lowerMSASplatImm(Op, 2, DAG, true), ISD::SETLT); case Intrinsic::mips_clt_u_b: case Intrinsic::mips_clt_u_h: case Intrinsic::mips_clt_u_w: @@ -1990,15 +1999,28 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_insve_b: case Intrinsic::mips_insve_h: case Intrinsic::mips_insve_w: - case Intrinsic::mips_insve_d: + case Intrinsic::mips_insve_d: { + // Report an error for out of range values. 
+ int64_t Max; + switch (Intrinsic) { + case Intrinsic::mips_insve_b: Max = 15; break; + case Intrinsic::mips_insve_h: Max = 7; break; + case Intrinsic::mips_insve_w: Max = 3; break; + case Intrinsic::mips_insve_d: Max = 1; break; + default: llvm_unreachable("Unmatched intrinsic"); + } + int64_t Value = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue(); + if (Value < 0 || Value > Max) + report_fatal_error("Immediate out of range"); return DAG.getNode(MipsISD::INSVE, DL, Op->getValueType(0), Op->getOperand(1), Op->getOperand(2), Op->getOperand(3), DAG.getConstant(0, DL, MVT::i32)); + } case Intrinsic::mips_ldi_b: case Intrinsic::mips_ldi_h: case Intrinsic::mips_ldi_w: case Intrinsic::mips_ldi_d: - return lowerMSASplatImm(Op, 1, DAG); + return lowerMSASplatImm(Op, 1, DAG, true); case Intrinsic::mips_lsa: case Intrinsic::mips_dlsa: { EVT ResTy = Op->getValueType(0); @@ -2032,7 +2054,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_maxi_s_w: case Intrinsic::mips_maxi_s_d: return DAG.getNode(MipsISD::VSMAX, DL, Op->getValueType(0), - Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG)); + Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG, true)); case Intrinsic::mips_maxi_u_b: case Intrinsic::mips_maxi_u_h: case Intrinsic::mips_maxi_u_w: @@ -2056,7 +2078,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_mini_s_w: case Intrinsic::mips_mini_s_d: return DAG.getNode(MipsISD::VSMIN, DL, Op->getValueType(0), - Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG)); + Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG, true)); case Intrinsic::mips_mini_u_b: case Intrinsic::mips_mini_u_h: case Intrinsic::mips_mini_u_w: @@ -2129,11 +2151,59 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_pcnt_w: case Intrinsic::mips_pcnt_d: return DAG.getNode(ISD::CTPOP, DL, Op->getValueType(0), Op->getOperand(1)); + case Intrinsic::mips_sat_s_b: + case Intrinsic::mips_sat_s_h: + case Intrinsic::mips_sat_s_w: + case Intrinsic::mips_sat_s_d: + case Intrinsic::mips_sat_u_b: + case Intrinsic::mips_sat_u_h: + case Intrinsic::mips_sat_u_w: + case Intrinsic::mips_sat_u_d: { + // Report an error for out of range values. + int64_t Max; + switch (Intrinsic) { + case Intrinsic::mips_sat_s_b: + case Intrinsic::mips_sat_u_b: Max = 7; break; + case Intrinsic::mips_sat_s_h: + case Intrinsic::mips_sat_u_h: Max = 15; break; + case Intrinsic::mips_sat_s_w: + case Intrinsic::mips_sat_u_w: Max = 31; break; + case Intrinsic::mips_sat_s_d: + case Intrinsic::mips_sat_u_d: Max = 63; break; + default: llvm_unreachable("Unmatched intrinsic"); + } + int64_t Value = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue(); + if (Value < 0 || Value > Max) + report_fatal_error("Immediate out of range"); + return SDValue(); + } case Intrinsic::mips_shf_b: case Intrinsic::mips_shf_h: - case Intrinsic::mips_shf_w: + case Intrinsic::mips_shf_w: { + int64_t Value = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue(); + if (Value < 0 || Value > 255) + report_fatal_error("Immediate out of range"); return DAG.getNode(MipsISD::SHF, DL, Op->getValueType(0), Op->getOperand(2), Op->getOperand(1)); + } + case Intrinsic::mips_sldi_b: + case Intrinsic::mips_sldi_h: + case Intrinsic::mips_sldi_w: + case Intrinsic::mips_sldi_d: { + // Report an error for out of range values. 
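The hunks above (binsli/binsri, insve, sat, shf, and the sldi case that continues below) all add the same guard: pick the largest legal immediate for the intrinsic's element size and report a fatal error when the operand lies outside [0, Max]. A standalone sketch of that guard for the sat family; the enum and error hook are simplified stand-ins for the intrinsic IDs and report_fatal_error.

#include <cstdint>
#include <cstdio>
#include <cstdlib>

enum class MSAIntrinsic { sat_s_b, sat_s_h, sat_s_w, sat_s_d };

[[noreturn]] void reportFatalError(const char *Msg) {
  std::fprintf(stderr, "fatal error: %s\n", Msg);
  std::abort();
}

void checkSatImmediate(MSAIntrinsic Id, int64_t Value) {
  int64_t Max = 0;
  switch (Id) {
  case MSAIntrinsic::sat_s_b: Max = 7;  break;  // i8 lanes
  case MSAIntrinsic::sat_s_h: Max = 15; break;  // i16 lanes
  case MSAIntrinsic::sat_s_w: Max = 31; break;  // i32 lanes
  case MSAIntrinsic::sat_s_d: Max = 63; break;  // i64 lanes
  }
  if (Value < 0 || Value > Max)
    reportFatalError("Immediate out of range");
}

int main() {
  checkSatImmediate(MSAIntrinsic::sat_s_h, 15);  // in range, returns normally
  return 0;
}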
+ int64_t Max; + switch (Intrinsic) { + case Intrinsic::mips_sldi_b: Max = 15; break; + case Intrinsic::mips_sldi_h: Max = 7; break; + case Intrinsic::mips_sldi_w: Max = 3; break; + case Intrinsic::mips_sldi_d: Max = 1; break; + default: llvm_unreachable("Unmatched intrinsic"); + } + int64_t Value = cast<ConstantSDNode>(Op->getOperand(3))->getSExtValue(); + if (Value < 0 || Value > Max) + report_fatal_error("Immediate out of range"); + return SDValue(); + } case Intrinsic::mips_sll_b: case Intrinsic::mips_sll_h: case Intrinsic::mips_sll_w: @@ -2176,6 +2246,24 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_srai_d: return DAG.getNode(ISD::SRA, DL, Op->getValueType(0), Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG)); + case Intrinsic::mips_srari_b: + case Intrinsic::mips_srari_h: + case Intrinsic::mips_srari_w: + case Intrinsic::mips_srari_d: { + // Report an error for out of range values. + int64_t Max; + switch (Intrinsic) { + case Intrinsic::mips_srari_b: Max = 7; break; + case Intrinsic::mips_srari_h: Max = 15; break; + case Intrinsic::mips_srari_w: Max = 31; break; + case Intrinsic::mips_srari_d: Max = 63; break; + default: llvm_unreachable("Unmatched intrinsic"); + } + int64_t Value = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue(); + if (Value < 0 || Value > Max) + report_fatal_error("Immediate out of range"); + return SDValue(); + } case Intrinsic::mips_srl_b: case Intrinsic::mips_srl_h: case Intrinsic::mips_srl_w: @@ -2188,6 +2276,24 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_srli_d: return DAG.getNode(ISD::SRL, DL, Op->getValueType(0), Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG)); + case Intrinsic::mips_srlri_b: + case Intrinsic::mips_srlri_h: + case Intrinsic::mips_srlri_w: + case Intrinsic::mips_srlri_d: { + // Report an error for out of range values. + int64_t Max; + switch (Intrinsic) { + case Intrinsic::mips_srlri_b: Max = 7; break; + case Intrinsic::mips_srlri_h: Max = 15; break; + case Intrinsic::mips_srlri_w: Max = 31; break; + case Intrinsic::mips_srlri_d: Max = 63; break; + default: llvm_unreachable("Unmatched intrinsic"); + } + int64_t Value = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue(); + if (Value < 0 || Value > Max) + report_fatal_error("Immediate out of range"); + return SDValue(); + } case Intrinsic::mips_subv_b: case Intrinsic::mips_subv_h: case Intrinsic::mips_subv_w: @@ -2219,7 +2325,8 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, } } -static SDValue lowerMSALoadIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) { +static SDValue lowerMSALoadIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr, + const MipsSubtarget &Subtarget) { SDLoc DL(Op); SDValue ChainIn = Op->getOperand(0); SDValue Address = Op->getOperand(2); @@ -2227,6 +2334,12 @@ static SDValue lowerMSALoadIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) { EVT ResTy = Op->getValueType(0); EVT PtrTy = Address->getValueType(0); + // For N64 addresses have the underlying type MVT::i64. This intrinsic + // however takes an i32 signed constant offset. The actual type of the + // intrinsic is a scaled signed i10. 
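Under the N64 ABI the pointer type is i64 while the MSA ld/st intrinsics carry a signed i32 offset, so the code above (and the matching store path below) sign-extends the offset before adding it to the base address; zero-extending a negative offset would instead push the address up by 2^32. A minimal numeric illustration of the difference.

#include <cassert>
#include <cstdint>

int main() {
  int32_t Offset = -16;     // signed i32 offset carried by the intrinsic
  uint64_t Base = 0x1000;   // i64 pointer value under N64

  uint64_t SExt = Base + (uint64_t)(int64_t)Offset;   // sign-extend, then add
  uint64_t ZExt = Base + (uint64_t)(uint32_t)Offset;  // zero-extend: wrong

  assert(SExt == 0xFF0);    // Base - 16, as intended
  assert(ZExt != SExt);     // off by 2^32 for any negative offset
  return 0;
}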
+ if (Subtarget.isABI_N64()) + Offset = DAG.getNode(ISD::SIGN_EXTEND, DL, PtrTy, Offset); + Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset); return DAG.getLoad(ResTy, DL, ChainIn, Address, MachinePointerInfo(), /* Alignment = */ 16); @@ -2282,11 +2395,12 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::mips_ld_h: case Intrinsic::mips_ld_w: case Intrinsic::mips_ld_d: - return lowerMSALoadIntr(Op, DAG, Intr); + return lowerMSALoadIntr(Op, DAG, Intr, Subtarget); } } -static SDValue lowerMSAStoreIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) { +static SDValue lowerMSAStoreIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr, + const MipsSubtarget &Subtarget) { SDLoc DL(Op); SDValue ChainIn = Op->getOperand(0); SDValue Value = Op->getOperand(2); @@ -2294,6 +2408,12 @@ static SDValue lowerMSAStoreIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) { SDValue Offset = Op->getOperand(4); EVT PtrTy = Address->getValueType(0); + // For N64 addresses have the underlying type MVT::i64. This intrinsic + // however takes an i32 signed constant offset. The actual type of the + // intrinsic is a scaled signed i10. + if (Subtarget.isABI_N64()) + Offset = DAG.getNode(ISD::SIGN_EXTEND, DL, PtrTy, Offset); + Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset); return DAG.getStore(ChainIn, DL, Value, Address, MachinePointerInfo(), @@ -2310,7 +2430,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_VOID(SDValue Op, case Intrinsic::mips_st_h: case Intrinsic::mips_st_w: case Intrinsic::mips_st_d: - return lowerMSAStoreIntr(Op, DAG, Intr); + return lowerMSAStoreIntr(Op, DAG, Intr, Subtarget); } } @@ -3377,8 +3497,12 @@ MipsSETargetLowering::emitFILL_FW(MachineInstr &MI, DebugLoc DL = MI.getDebugLoc(); unsigned Wd = MI.getOperand(0).getReg(); unsigned Fs = MI.getOperand(1).getReg(); - unsigned Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); - unsigned Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); + unsigned Wt1 = RegInfo.createVirtualRegister( + Subtarget.useOddSPReg() ? &Mips::MSA128WRegClass + : &Mips::MSA128WEvensRegClass); + unsigned Wt2 = RegInfo.createVirtualRegister( + Subtarget.useOddSPReg() ? 
&Mips::MSA128WRegClass + : &Mips::MSA128WEvensRegClass); BuildMI(*BB, MI, DL, TII->get(Mips::IMPLICIT_DEF), Wt1); BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_SUBREG), Wt2) diff --git a/lib/Target/NVPTX/ManagedStringPool.h b/lib/Target/NVPTX/ManagedStringPool.h index a2d670f8d39d..7fc0156216f5 100644 --- a/lib/Target/NVPTX/ManagedStringPool.h +++ b/lib/Target/NVPTX/ManagedStringPool.h @@ -27,7 +27,8 @@ class ManagedStringPool { SmallVector<std::string *, 8> Pool; public: - ManagedStringPool() {} + ManagedStringPool() = default; + ~ManagedStringPool() { SmallVectorImpl<std::string *>::iterator Current = Pool.begin(); while (Current != Pool.end()) { @@ -43,6 +44,6 @@ public: } }; -} +} // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_NVPTX_MANAGEDSTRINGPOOL_H diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 04c8d5c0443e..3c2594c77f45 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -12,42 +12,83 @@ // //===----------------------------------------------------------------------===// -#include "NVPTXAsmPrinter.h" #include "InstPrinter/NVPTXInstPrinter.h" +#include "MCTargetDesc/NVPTXBaseInfo.h" #include "MCTargetDesc/NVPTXMCAsmInfo.h" #include "NVPTX.h" -#include "NVPTXInstrInfo.h" +#include "NVPTXAsmPrinter.h" #include "NVPTXMCExpr.h" #include "NVPTXMachineFunctionInfo.h" #include "NVPTXRegisterInfo.h" +#include "NVPTXSubtarget.h" #include "NVPTXTargetMachine.h" #include "NVPTXUtilities.h" #include "cl_common_defines.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/ADT/Twine.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/Mangler.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FormattedStream.h" #include "llvm/Support/Path.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetMachine.h" +#include 
"llvm/Target/TargetRegisterInfo.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include <cassert> +#include <cstdint> +#include <cstring> +#include <new> #include <sstream> +#include <string> +#include <utility> +#include <vector> + using namespace llvm; #define DEPOTNAME "__local_depot" @@ -62,11 +103,11 @@ InterleaveSrc("nvptx-emit-src", cl::ZeroOrMore, cl::Hidden, cl::desc("NVPTX Specific: Emit source line in ptx file"), cl::init(false)); -namespace { /// DiscoverDependentGlobals - Return a set of GlobalVariables on which \p V /// depends. -void DiscoverDependentGlobals(const Value *V, - DenseSet<const GlobalVariable *> &Globals) { +static void +DiscoverDependentGlobals(const Value *V, + DenseSet<const GlobalVariable *> &Globals) { if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) Globals.insert(GV); else { @@ -80,11 +121,12 @@ void DiscoverDependentGlobals(const Value *V, /// VisitGlobalVariableForEmission - Add \p GV to the list of GlobalVariable /// instances to be emitted, but only after any dependents have been added -/// first. -void VisitGlobalVariableForEmission( - const GlobalVariable *GV, SmallVectorImpl<const GlobalVariable *> &Order, - DenseSet<const GlobalVariable *> &Visited, - DenseSet<const GlobalVariable *> &Visiting) { +/// first.s +static void +VisitGlobalVariableForEmission(const GlobalVariable *GV, + SmallVectorImpl<const GlobalVariable *> &Order, + DenseSet<const GlobalVariable *> &Visited, + DenseSet<const GlobalVariable *> &Visiting) { // Have we already visited this one? if (Visited.count(GV)) return; @@ -108,7 +150,6 @@ void VisitGlobalVariableForEmission( Visited.insert(GV); Visiting.erase(GV); } -} void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI) { if (!EmitLineNumbers) @@ -369,7 +410,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { } else if (Ty->isAggregateType() || Ty->isVectorTy()) { unsigned totalsz = DL.getTypeAllocSize(Ty); unsigned retAlignment = 0; - if (!llvm::getAlign(*F, 0, retAlignment)) + if (!getAlign(*F, 0, retAlignment)) retAlignment = DL.getABITypeAlignment(Ty); O << ".param .align " << retAlignment << " .b8 func_retval0[" << totalsz << "]"; @@ -401,7 +442,6 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { } } O << ") "; - return; } void NVPTXAsmPrinter::printReturnValStr(const MachineFunction &MF, @@ -459,7 +499,7 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() { MRI = &MF->getRegInfo(); F = MF->getFunction(); emitLinkageDirective(F, O); - if (llvm::isKernelFunction(*F)) + if (isKernelFunction(*F)) O << ".entry "; else { O << ".func "; @@ -470,7 +510,7 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() { emitFunctionParamList(*MF, O); - if (llvm::isKernelFunction(*F)) + if (isKernelFunction(*F)) emitKernelFunctionDirectives(*F, O); OutStreamer->EmitRawText(O.str()); @@ -513,15 +553,15 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, // If none of reqntid* is specified, don't output reqntid directive. 
unsigned reqntidx, reqntidy, reqntidz; bool specified = false; - if (!llvm::getReqNTIDx(F, reqntidx)) + if (!getReqNTIDx(F, reqntidx)) reqntidx = 1; else specified = true; - if (!llvm::getReqNTIDy(F, reqntidy)) + if (!getReqNTIDy(F, reqntidy)) reqntidy = 1; else specified = true; - if (!llvm::getReqNTIDz(F, reqntidz)) + if (!getReqNTIDz(F, reqntidz)) reqntidz = 1; else specified = true; @@ -535,15 +575,15 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, // If none of maxntid* is specified, don't output maxntid directive. unsigned maxntidx, maxntidy, maxntidz; specified = false; - if (!llvm::getMaxNTIDx(F, maxntidx)) + if (!getMaxNTIDx(F, maxntidx)) maxntidx = 1; else specified = true; - if (!llvm::getMaxNTIDy(F, maxntidy)) + if (!getMaxNTIDy(F, maxntidy)) maxntidy = 1; else specified = true; - if (!llvm::getMaxNTIDz(F, maxntidz)) + if (!getMaxNTIDz(F, maxntidz)) maxntidz = 1; else specified = true; @@ -553,11 +593,11 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, << "\n"; unsigned mincta; - if (llvm::getMinCTASm(F, mincta)) + if (getMinCTASm(F, mincta)) O << ".minnctapersm " << mincta << "\n"; unsigned maxnreg; - if (llvm::getMaxNReg(F, maxnreg)) + if (getMaxNReg(F, maxnreg)) O << ".maxnreg " << maxnreg << "\n"; } @@ -617,12 +657,9 @@ void NVPTXAsmPrinter::printVecModifiedImmediate( llvm_unreachable("Unknown Modifier on immediate operand"); } - - void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) { - emitLinkageDirective(F, O); - if (llvm::isKernelFunction(*F)) + if (isKernelFunction(*F)) O << ".entry "; else O << ".func "; @@ -684,7 +721,7 @@ static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) { if (!gv->hasInternalLinkage()) return false; PointerType *Pty = gv->getType(); - if (Pty->getAddressSpace() != llvm::ADDRESS_SPACE_SHARED) + if (Pty->getAddressSpace() != ADDRESS_SPACE_SHARED) return false; const Function *oneFunc = nullptr; @@ -699,7 +736,7 @@ static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) { } static bool useFuncSeen(const Constant *C, - llvm::DenseMap<const Function *, bool> &seenMap) { + DenseMap<const Function *, bool> &seenMap) { for (const User *U : C->users()) { if (const Constant *cu = dyn_cast<Constant>(U)) { if (useFuncSeen(cu, seenMap)) @@ -719,7 +756,7 @@ static bool useFuncSeen(const Constant *C, } void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) { - llvm::DenseMap<const Function *, bool> seenMap; + DenseMap<const Function *, bool> seenMap; for (Module::const_iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) { const Function *F = &*FI; @@ -1040,7 +1077,6 @@ void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V, void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O, bool processDemoted) { - // Skip meta data if (GVar->hasSection()) { if (GVar->getSection() == "llvm.metadata") @@ -1069,13 +1105,13 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, O << ".weak "; } - if (llvm::isTexture(*GVar)) { - O << ".global .texref " << llvm::getTextureName(*GVar) << ";\n"; + if (isTexture(*GVar)) { + O << ".global .texref " << getTextureName(*GVar) << ";\n"; return; } - if (llvm::isSurface(*GVar)) { - O << ".global .surfref " << llvm::getSurfaceName(*GVar) << ";\n"; + if (isSurface(*GVar)) { + O << ".global .surfref " << getSurfaceName(*GVar) << ";\n"; return; } @@ -1088,8 +1124,8 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, return; } - if 
(llvm::isSampler(*GVar)) { - O << ".global .samplerref " << llvm::getSamplerName(*GVar); + if (isSampler(*GVar)) { + O << ".global .samplerref " << getSamplerName(*GVar); const Constant *Initializer = nullptr; if (GVar->hasInitializer()) @@ -1150,12 +1186,11 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, } if (GVar->hasPrivateLinkage()) { - - if (!strncmp(GVar->getName().data(), "unrollpragma", 12)) + if (strncmp(GVar->getName().data(), "unrollpragma", 12) == 0) return; // FIXME - need better way (e.g. Metadata) to avoid generating this global - if (!strncmp(GVar->getName().data(), "filename", 8)) + if (strncmp(GVar->getName().data(), "filename", 8) == 0) return; if (GVar->use_empty()) return; @@ -1199,8 +1234,8 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, // Ptx allows variable initilization only for constant and global state // spaces. if (GVar->hasInitializer()) { - if ((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) || - (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) { + if ((PTy->getAddressSpace() == ADDRESS_SPACE_GLOBAL) || + (PTy->getAddressSpace() == ADDRESS_SPACE_CONST)) { const Constant *Initializer = GVar->getInitializer(); // 'undef' is treated as there is no value specified. if (!Initializer->isNullValue() && !isa<UndefValue>(Initializer)) { @@ -1233,8 +1268,8 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, ElementSize = DL.getTypeStoreSize(ETy); // Ptx allows variable initilization only for constant and // global state spaces. - if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) || - (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) && + if (((PTy->getAddressSpace() == ADDRESS_SPACE_GLOBAL) || + (PTy->getAddressSpace() == ADDRESS_SPACE_CONST)) && GVar->hasInitializer()) { const Constant *Initializer = GVar->getInitializer(); if (!isa<UndefValue>(Initializer) && !Initializer->isNullValue()) { @@ -1285,7 +1320,6 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, default: llvm_unreachable("type not supported yet"); } - } O << ";\n"; } @@ -1305,16 +1339,16 @@ void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) { void NVPTXAsmPrinter::emitPTXAddressSpace(unsigned int AddressSpace, raw_ostream &O) const { switch (AddressSpace) { - case llvm::ADDRESS_SPACE_LOCAL: + case ADDRESS_SPACE_LOCAL: O << "local"; break; - case llvm::ADDRESS_SPACE_GLOBAL: + case ADDRESS_SPACE_GLOBAL: O << "global"; break; - case llvm::ADDRESS_SPACE_CONST: + case ADDRESS_SPACE_CONST: O << "const"; break; - case llvm::ADDRESS_SPACE_SHARED: + case ADDRESS_SPACE_SHARED: O << "shared"; break; default: @@ -1363,7 +1397,6 @@ NVPTXAsmPrinter::getPTXFundamentalTypeStr(Type *Ty, bool useB4PTR) const { void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O) { - const DataLayout &DL = getDataLayout(); // GlobalVariables are always constant pointers themselves. 
@@ -1406,7 +1439,6 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, default: llvm_unreachable("type not supported yet"); } - return; } static unsigned int getOpenCLAlignment(const DataLayout &DL, Type *Ty) { @@ -1450,7 +1482,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { Function::const_arg_iterator I, E; unsigned paramIndex = 0; bool first = true; - bool isKernelFunc = llvm::isKernelFunction(*F); + bool isKernelFunc = isKernelFunction(*F); bool isABI = (nvptxSubtarget->getSmVersion() >= 20); MVT thePointerTy = TLI->getPointerTy(DL); @@ -1533,13 +1565,13 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { default: O << ".ptr "; break; - case llvm::ADDRESS_SPACE_CONST: + case ADDRESS_SPACE_CONST: O << ".ptr .const "; break; - case llvm::ADDRESS_SPACE_SHARED: + case ADDRESS_SPACE_SHARED: O << ".ptr .shared "; break; - case llvm::ADDRESS_SPACE_GLOBAL: + case ADDRESS_SPACE_GLOBAL: O << ".ptr .global "; break; } @@ -1820,7 +1852,6 @@ static void ConvertDoubleToBytes(unsigned char *p, double val) { void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer) { - const DataLayout &DL = getDataLayout(); if (isa<UndefValue>(CPV) || CPV->isNullValue()) { @@ -1985,7 +2016,6 @@ void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV, // buildTypeNameMap - Run through symbol table looking for type names. // - bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) { switch (MI.getOpcode()) { default: @@ -2100,7 +2130,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric) raw_string_ostream OS(S); OS << "Unsupported expression in static initializer: "; CE->printAsOperand(OS, /*PrintType=*/ false, - !MF ? 0 : MF->getFunction()->getParent()); + !MF ? 
nullptr : MF->getFunction()->getParent()); report_fatal_error(OS.str()); } @@ -2330,7 +2360,7 @@ void NVPTXAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O, const char *Modifier) { printOperand(MI, opNum, O); - if (Modifier && !strcmp(Modifier, "add")) { + if (Modifier && strcmp(Modifier, "add") == 0) { O << ", "; printOperand(MI, opNum + 1, O); } else { diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h index 3dcc0e358a14..8ec3476b8719 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -1,4 +1,4 @@ -//===-- NVPTXAsmPrinter.h - NVPTX LLVM assembly writer --------------------===// +//===-- NVPTXAsmPrinter.h - NVPTX LLVM assembly writer ----------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -18,17 +18,34 @@ #include "NVPTX.h" #include "NVPTXSubtarget.h" #include "NVPTXTargetMachine.h" -#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Value.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/FormattedStream.h" +#include "llvm/PassAnalysisSupport.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" +#include <algorithm> +#include <cassert> #include <fstream> +#include <map> +#include <memory> +#include <string> +#include <vector> // The ptx syntax and format is very different from that usually seem in a .s // file, @@ -40,7 +57,8 @@ // (subclass of MCStreamer). namespace llvm { - class MCOperand; + +class MCOperand; class LineReader { private: @@ -49,14 +67,17 @@ private: char buff[512]; std::string theFileName; SmallVector<unsigned, 32> lineOffset; + public: LineReader(std::string filename) { theCurLine = 0; fstr.open(filename.c_str()); theFileName = filename; } - std::string fileName() { return theFileName; } + ~LineReader() { fstr.close(); } + + std::string fileName() { return theFileName; } std::string readLine(unsigned line); }; @@ -107,6 +128,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { numSymbols = 0; EmitGeneric = AP.EmitGeneric; } + unsigned addBytes(unsigned char *Ptr, int Num, int Bytes) { assert((curpos + Num) <= size); assert((curpos + Bytes) <= size); @@ -120,6 +142,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { } return curpos; } + unsigned addZeros(int Num) { assert((curpos + Num) <= size); for (int i = 0; i < Num; ++i) { @@ -128,12 +151,14 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { } return curpos; } + void addSymbol(const Value *GVar, const Value *GVarBeforeStripping) { symbolPosInBuffer.push_back(curpos); Symbols.push_back(GVar); SymbolsBeforeStripping.push_back(GVarBeforeStripping); numSymbols++; } + void print() { if (numSymbols == 0) { // print out in bytes @@ -267,7 +292,7 @@ private: std::map<Type *, std::string> TypeNameMap; // List of variables demoted to a function scope. 
- std::map<const Function *, std::vector<const GlobalVariable *> > localDecls; + std::map<const Function *, std::vector<const GlobalVariable *>> localDecls; // To record filename to ID mapping std::map<std::string, unsigned> filenameMap; @@ -292,7 +317,8 @@ private: bool isLoopHeaderOfNoUnroll(const MachineBasicBlock &MBB) const; - LineReader *reader; + LineReader *reader = nullptr; + LineReader *getReader(const std::string &); // Used to control the need to emit .generic() in the initializer of @@ -312,20 +338,17 @@ public: NVPTXAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)), EmitGeneric(static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() == - NVPTX::CUDA) { - CurrentBankselLabelInBasicBlock = ""; - reader = nullptr; - } + NVPTX::CUDA) {} - ~NVPTXAsmPrinter() { - if (!reader) - delete reader; + ~NVPTXAsmPrinter() override { + delete reader; } bool runOnMachineFunction(MachineFunction &F) override { nvptxSubtarget = &F.getSubtarget<NVPTXSubtarget>(); return AsmPrinter::runOnMachineFunction(F); } + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineLoopInfo>(); AsmPrinter::getAnalysisUsage(AU); @@ -338,6 +361,7 @@ public: DebugLoc prevDebugLoc; void emitLineNumberAsDotLoc(const MachineInstr &); }; -} // end of namespace -#endif +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_NVPTX_NVPTXASMPRINTER_H diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index 2e4764feff11..7a760fd38d0f 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1,3 +1,4 @@ +//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===// // // The LLVM Compiler Infrastructure // @@ -11,31 +12,55 @@ // //===----------------------------------------------------------------------===// -#include "NVPTXISelLowering.h" +#include "MCTargetDesc/NVPTXBaseInfo.h" #include "NVPTX.h" +#include "NVPTXISelLowering.h" +#include "NVPTXSection.h" +#include "NVPTXSubtarget.h" #include "NVPTXTargetMachine.h" #include "NVPTXTargetObjectFile.h" #include "NVPTXUtilities.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/Analysis.h" -#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" #include "llvm/IR/CallSite.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" -#include "llvm/MC/MCSectionELF.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetCallingConv.h" 
+#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> #include <sstream> +#include <string> +#include <utility> +#include <vector> #undef DEBUG_TYPE #define DEBUG_TYPE "nvptx-lower" @@ -109,7 +134,6 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI) : TargetLowering(TM), nvTM(&TM), STI(STI) { - // always lower memset, memcpy, and memmove intrinsics to load/store // instructions, rather // then generating calls to memset, mempcy or memmove. @@ -981,7 +1005,7 @@ std::string NVPTXTargetLowering::getPrototype( unsigned align = 0; const CallInst *CallI = cast<CallInst>(CS->getInstruction()); // +1 because index 0 is reserved for return type alignment - if (!llvm::getAlign(*CallI, i + 1, align)) + if (!getAlign(*CallI, i + 1, align)) align = DL.getABITypeAlignment(Ty); unsigned sz = DL.getTypeAllocSize(Ty); O << ".param .align " << align << " .b8 "; @@ -1047,7 +1071,7 @@ unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, // With bitcast'd call targets, the instruction will be the call if (isa<CallInst>(CalleeI)) { // Check if we have call alignment metadata - if (llvm::getAlign(*cast<CallInst>(CalleeI), Idx, Align)) + if (getAlign(*cast<CallInst>(CalleeI), Idx, Align)) return Align; const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue(); @@ -1070,7 +1094,7 @@ unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, // Check for function alignment information if we found that the // ultimate target is a Function if (DirectCallee) - if (llvm::getAlign(*cast<Function>(DirectCallee), Idx, Align)) + if (getAlign(*cast<Function>(DirectCallee), Idx, Align)) return Align; // Call is indirect or alignment information is not available, fall back to @@ -1747,7 +1771,6 @@ SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; if (VTBits == 32 && STI.getSmVersion() >= 35) { - // For 32bit and sm35, we can use the funnel shift 'shf' instruction. // {dHi, dLo} = {aHi, aLo} >> Amt // dHi = aHi >> Amt @@ -1761,7 +1784,6 @@ SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, return DAG.getMergeValues(Ops, dl); } else { - // {dHi, dLo} = {aHi, aLo} >> Amt // - if (Amt>=size) then // dLo = aHi >> (Amt-size) @@ -1809,7 +1831,6 @@ SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, SDValue ShAmt = Op.getOperand(2); if (VTBits == 32 && STI.getSmVersion() >= 35) { - // For 32bit and sm35, we can use the funnel shift 'shf' instruction. // {dHi, dLo} = {aHi, aLo} << Amt // dHi = shf.l.clamp aLo, aHi, Amt @@ -1823,7 +1844,6 @@ SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, return DAG.getMergeValues(Ops, dl); } else { - // {dHi, dLo} = {aHi, aLo} << Amt // - if (Amt>=size) then // dLo = aLo << Amt (all 0) @@ -2002,11 +2022,10 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { case 2: Opcode = NVPTXISD::StoreV2; break; - case 4: { + case 4: Opcode = NVPTXISD::StoreV4; break; } - } SmallVector<SDValue, 8> Ops; @@ -2140,7 +2159,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( theArgs[i], (theArgs[i]->getParent() ? 
theArgs[i]->getParent()->getParent() : nullptr))) { - assert(llvm::isKernelFunction(*F) && + assert(isKernelFunction(*F) && "Only kernels can have image/sampler params"); InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32)); continue; @@ -2193,7 +2212,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( 0); assert(vtparts.size() > 0 && "empty aggregate type not expected"); bool aggregateIsPacked = false; - if (StructType *STy = llvm::dyn_cast<StructType>(Ty)) + if (StructType *STy = dyn_cast<StructType>(Ty)) aggregateIsPacked = STy->isPacked(); SDValue Arg = getParamSymbol(DAG, idx, PtrVT); @@ -2202,7 +2221,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( EVT partVT = vtparts[parti]; Value *srcValue = Constant::getNullValue( PointerType::get(partVT.getTypeForEVT(F->getContext()), - llvm::ADDRESS_SPACE_PARAM)); + ADDRESS_SPACE_PARAM)); SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, DAG.getConstant(offsets[parti], dl, PtrVT)); @@ -2242,7 +2261,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( if (NumElts == 1) { // We only have one element, so just directly load it Value *SrcValue = Constant::getNullValue(PointerType::get( - EltVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); + EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); SDValue P = DAG.getLoad( EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue), DL.getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())), @@ -2260,7 +2279,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( // f32,f32 = load ... EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, 2); Value *SrcValue = Constant::getNullValue(PointerType::get( - VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); + VecVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); SDValue P = DAG.getLoad( VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue), DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())), @@ -2301,7 +2320,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( for (unsigned i = 0; i < NumElts; i += VecSize) { Value *SrcValue = Constant::getNullValue( PointerType::get(VecVT.getTypeForEVT(F->getContext()), - llvm::ADDRESS_SPACE_PARAM)); + ADDRESS_SPACE_PARAM)); SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, DAG.getConstant(Ofst, dl, PtrVT)); SDValue P = DAG.getLoad( @@ -2335,7 +2354,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( // If ABI, load from the param symbol SDValue Arg = getParamSymbol(DAG, idx, PtrVT); Value *srcValue = Constant::getNullValue(PointerType::get( - ObjectVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); + ObjectVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); SDValue p; if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) { ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ? 
@@ -2424,7 +2443,6 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, DAG.getVTList(MVT::Other), Ops, EltVT, MachinePointerInfo()); - } else if (NumElts == 2) { // V2 store SDValue StoreVal0 = OutVals[0]; @@ -2558,7 +2576,6 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain); } - void NVPTXTargetLowering::LowerAsmOperandForConstraint( SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const { @@ -3306,7 +3323,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.memVT = getValueType(DL, I.getType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = true; Info.align = 0; @@ -3326,7 +3343,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.memVT = getValueType(DL, I.getType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); @@ -3347,7 +3364,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.memVT = getValueType(DL, I.getType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); @@ -3410,17 +3427,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: - case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: { + case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: Info.opc = getOpcForTextureInstr(Intrinsic); Info.memVT = MVT::v4f32; Info.ptrVal = nullptr; Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = 16; return true; - } + case Intrinsic::nvvm_tex_1d_v4s32_s32: case Intrinsic::nvvm_tex_1d_v4s32_f32: case Intrinsic::nvvm_tex_1d_level_v4s32_f32: @@ -3532,17 +3549,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: - case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: { + case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: Info.opc = getOpcForTextureInstr(Intrinsic); Info.memVT = MVT::v4i32; Info.ptrVal = nullptr; Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = 16; return true; - } + case Intrinsic::nvvm_suld_1d_i8_clamp: case Intrinsic::nvvm_suld_1d_v2i8_clamp: case Intrinsic::nvvm_suld_1d_v4i8_clamp: @@ -3587,17 +3604,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_suld_2d_array_v4i8_zero: case Intrinsic::nvvm_suld_3d_i8_zero: case Intrinsic::nvvm_suld_3d_v2i8_zero: - case Intrinsic::nvvm_suld_3d_v4i8_zero: { + case Intrinsic::nvvm_suld_3d_v4i8_zero: Info.opc = getOpcForSurfaceInstr(Intrinsic); Info.memVT = MVT::i8; Info.ptrVal = nullptr; Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = 16; return true; - } + case Intrinsic::nvvm_suld_1d_i16_clamp: case Intrinsic::nvvm_suld_1d_v2i16_clamp: case Intrinsic::nvvm_suld_1d_v4i16_clamp: @@ -3642,17 +3659,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case 
Intrinsic::nvvm_suld_2d_array_v4i16_zero: case Intrinsic::nvvm_suld_3d_i16_zero: case Intrinsic::nvvm_suld_3d_v2i16_zero: - case Intrinsic::nvvm_suld_3d_v4i16_zero: { + case Intrinsic::nvvm_suld_3d_v4i16_zero: Info.opc = getOpcForSurfaceInstr(Intrinsic); Info.memVT = MVT::i16; Info.ptrVal = nullptr; Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = 16; return true; - } + case Intrinsic::nvvm_suld_1d_i32_clamp: case Intrinsic::nvvm_suld_1d_v2i32_clamp: case Intrinsic::nvvm_suld_1d_v4i32_clamp: @@ -3697,17 +3714,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_suld_2d_array_v4i32_zero: case Intrinsic::nvvm_suld_3d_i32_zero: case Intrinsic::nvvm_suld_3d_v2i32_zero: - case Intrinsic::nvvm_suld_3d_v4i32_zero: { + case Intrinsic::nvvm_suld_3d_v4i32_zero: Info.opc = getOpcForSurfaceInstr(Intrinsic); Info.memVT = MVT::i32; Info.ptrVal = nullptr; Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = 16; return true; - } + case Intrinsic::nvvm_suld_1d_i64_clamp: case Intrinsic::nvvm_suld_1d_v2i64_clamp: case Intrinsic::nvvm_suld_1d_array_i64_clamp: @@ -3737,18 +3754,17 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_suld_2d_array_i64_zero: case Intrinsic::nvvm_suld_2d_array_v2i64_zero: case Intrinsic::nvvm_suld_3d_i64_zero: - case Intrinsic::nvvm_suld_3d_v2i64_zero: { + case Intrinsic::nvvm_suld_3d_v2i64_zero: Info.opc = getOpcForSurfaceInstr(Intrinsic); Info.memVT = MVT::i64; Info.ptrVal = nullptr; Info.offset = 0; - Info.vol = 0; + Info.vol = false; Info.readMem = true; Info.writeMem = false; Info.align = 16; return true; } - } return false; } @@ -3760,7 +3776,6 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { - // AddrMode - This represents an addressing mode of: // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg // @@ -4059,7 +4074,7 @@ static SDValue PerformANDCombine(SDNode *N, } bool AddTo = false; - if (AExt.getNode() != 0) { + if (AExt.getNode() != nullptr) { // Re-insert the ext as a zext. 
Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), AExt.getValueType(), Val); @@ -4204,7 +4219,6 @@ static bool IsMulWideOperandDemotable(SDValue Op, static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned) { - OperandSignedness LHSSign; // The LHS operand must be a demotable op diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td index 92a88c7f2506..0fbb0448e4c4 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -144,7 +144,7 @@ def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">; def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; -def true : Predicate<"1">; +def true : Predicate<"true">; def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">; diff --git a/lib/Target/NVPTX/NVPTXSection.h b/lib/Target/NVPTX/NVPTXSection.h index cad4f5668fdf..b0472de980fc 100644 --- a/lib/Target/NVPTX/NVPTXSection.h +++ b/lib/Target/NVPTX/NVPTXSection.h @@ -1,4 +1,4 @@ -//===- NVPTXSection.h - NVPTX-specific section representation -*- C++ -*-===// +//===- NVPTXSection.h - NVPTX-specific section representation ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -14,18 +14,20 @@ #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H #define LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H -#include "llvm/IR/GlobalVariable.h" #include "llvm/MC/MCSection.h" +#include "llvm/MC/SectionKind.h" namespace llvm { + /// Represents a section in PTX PTX does not have sections. We create this class /// in order to use the ASMPrint interface. /// class NVPTXSection final : public MCSection { virtual void anchor(); + public: NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K, nullptr) {} - ~NVPTXSection() {} + ~NVPTXSection() = default; /// Override this as NVPTX has its own way of printing switching /// to a section. 
@@ -40,4 +42,4 @@ public: } // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 6c68a2c9370d..eb357e0a4d50 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -11,41 +11,28 @@ // //===----------------------------------------------------------------------===// -#include "NVPTXTargetMachine.h" -#include "MCTargetDesc/NVPTXMCAsmInfo.h" #include "NVPTX.h" #include "NVPTXAllocaHoisting.h" #include "NVPTXLowerAggrCopies.h" +#include "NVPTXTargetMachine.h" #include "NVPTXTargetObjectFile.h" #include "NVPTXTargetTransformInfo.h" -#include "llvm/Analysis/Passes.h" -#include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/Verifier.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/FormattedStream.h" #include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Vectorize.h" +#include <cassert> +#include <string> using namespace llvm; @@ -57,6 +44,7 @@ static cl::opt<bool> cl::init(false), cl::Hidden); namespace llvm { + void initializeNVVMIntrRangePass(PassRegistry&); void initializeNVVMReflectPass(PassRegistry&); void initializeGenericToNVVMPass(PassRegistry&); @@ -66,7 +54,8 @@ void initializeNVPTXInferAddressSpacesPass(PassRegistry &); void initializeNVPTXLowerAggrCopiesPass(PassRegistry &); void initializeNVPTXLowerArgsPass(PassRegistry &); void initializeNVPTXLowerAllocaPass(PassRegistry &); -} + +} // end namespace llvm extern "C" void LLVMInitializeNVPTXTarget() { // Register the target. 
@@ -109,7 +98,7 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT, : LLVMTargetMachine(T, computeDataLayout(is64bit), TT, CPU, FS, Options, Reloc::PIC_, CM, OL), is64bit(is64bit), - TLOF(make_unique<NVPTXTargetObjectFile>()), + TLOF(llvm::make_unique<NVPTXTargetObjectFile>()), Subtarget(TT, CPU, FS, *this) { if (TT.getOS() == Triple::NVCL) drvInterface = NVPTX::NVCL; @@ -118,7 +107,7 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT, initAsmInfo(); } -NVPTXTargetMachine::~NVPTXTargetMachine() {} +NVPTXTargetMachine::~NVPTXTargetMachine() = default; void NVPTXTargetMachine32::anchor() {} @@ -141,6 +130,7 @@ NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT, : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} namespace { + class NVPTXPassConfig : public TargetPassConfig { public: NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM) @@ -170,6 +160,7 @@ private: // Add passes that perform straight-line scalar optimizations. void addStraightLineScalarOptimizationPasses(); }; + } // end anonymous namespace TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) { diff --git a/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/lib/Target/NVPTX/NVPTXTargetObjectFile.h index dc367a90594a..69c59d0296ab 100644 --- a/lib/Target/NVPTX/NVPTXTargetObjectFile.h +++ b/lib/Target/NVPTX/NVPTXTargetObjectFile.h @@ -11,14 +11,13 @@ #define LLVM_LIB_TARGET_NVPTX_NVPTXTARGETOBJECTFILE_H #include "NVPTXSection.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/SectionKind.h" #include "llvm/Target/TargetLoweringObjectFile.h" namespace llvm { -class GlobalVariable; -class Module; class NVPTXTargetObjectFile : public TargetLoweringObjectFile { - public: NVPTXTargetObjectFile() { TextSection = nullptr; @@ -43,7 +42,7 @@ public: DwarfMacinfoSection = nullptr; } - virtual ~NVPTXTargetObjectFile(); + ~NVPTXTargetObjectFile() override; void Initialize(MCContext &ctx, const TargetMachine &TM) override { TargetLoweringObjectFile::Initialize(ctx, TM); @@ -52,7 +51,6 @@ public: BSSSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getBSS()); ReadOnlySection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getReadOnly()); - StaticCtorSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata()); StaticDtorSection = @@ -102,4 +100,4 @@ public: } // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_NVPTX_NVPTXTARGETOBJECTFILE_H diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 48928ee2d540..dd7707084948 100644 --- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -115,7 +115,7 @@ bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) { int NVPTXTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) { // Legalize the type. 
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index d953aa8a7199..b6c271ae4cbc 100644 --- a/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -54,7 +54,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); }; diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index f7785342b364..f94d1eab097d 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -281,7 +281,7 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) { int PPCTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); // Fallback to the default implementation. diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h index 8308086ccfaa..30ee2814aba1 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -71,7 +71,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 2081809def70..2d0a06af18ae 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -547,8 +547,26 @@ bool SystemZTargetLowering::isFoldableMemAccessOffset(Instruction *I, assert (isa<LoadInst>(I) || isa<StoreInst>(I)); Type *MemAccessTy = (isa<LoadInst>(I) ? I->getType() : I->getOperand(0)->getType()); - if (!isUInt<12>(Offset) && - (MemAccessTy->isFloatingPointTy() || MemAccessTy->isVectorTy())) + bool IsFPAccess = MemAccessTy->isFloatingPointTy(); + bool IsVectorAccess = MemAccessTy->isVectorTy(); + + // A store of an extracted vector element will be combined into a VSTE type + // instruction. + if (!IsVectorAccess && isa<StoreInst>(I)) { + Value *DataOp = I->getOperand(0); + if (isa<ExtractElementInst>(DataOp)) + IsVectorAccess = true; + } + + // A load which gets inserted into a vector element will be combined into a + // VLE type instruction. 
+ if (!IsVectorAccess && isa<LoadInst>(I) && I->hasOneUse()) { + User *LoadUser = *I->user_begin(); + if (isa<InsertElementInst>(LoadUser)) + IsVectorAccess = true; + } + + if (!isUInt<12>(Offset) && (IsFPAccess || IsVectorAccess)) return false; return true; diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index e16ced1661a1..8a6d28490e8c 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -44,7 +44,7 @@ TargetMachine::TargetMachine(const Target &T, StringRef DataLayoutString, const TargetOptions &Options) : TheTarget(T), DL(DataLayoutString), TargetTriple(TT), TargetCPU(CPU), TargetFS(FS), AsmInfo(nullptr), MRI(nullptr), MII(nullptr), STI(nullptr), - RequireStructuredCFG(false), Options(Options) { + RequireStructuredCFG(false), DefaultOptions(Options), Options(Options) { if (EnableIPRA.getNumOccurrences()) this->Options.EnableIPRA = EnableIPRA; } @@ -63,14 +63,15 @@ bool TargetMachine::isPositionIndependent() const { /// \brief Reset the target options based on the function's attributes. // FIXME: This function needs to go away for a number of reasons: // a) global state on the TargetMachine is terrible in general, -// b) there's no default state here to keep, -// c) these target options should be passed only on the function +// b) these target options should be passed only on the function // and not on the TargetMachine (via TargetOptions) at all. void TargetMachine::resetTargetOptions(const Function &F) const { #define RESET_OPTION(X, Y) \ do { \ if (F.hasFnAttribute(Y)) \ Options.X = (F.getFnAttribute(Y).getValueAsString() == "true"); \ + else \ + Options.X = DefaultOptions.X; \ } while (0) RESET_OPTION(LessPreciseFPMADOption, "less-precise-fpmad"); @@ -87,6 +88,8 @@ void TargetMachine::resetTargetOptions(const Function &F) const { Options.FPDenormalMode = FPDenormal::PreserveSign; else if (Denormal == "positive-zero") Options.FPDenormalMode = FPDenormal::PositiveZero; + else + Options.FPDenormalMode = DefaultOptions.FPDenormalMode; } /// Returns the code generation relocation model. The choices are static, PIC, diff --git a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 529540ea4ed2..bc7020fded8c 100644 --- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -663,6 +663,9 @@ bool WebAssemblyFastISel::fastLowerArguments() { for (auto const &Arg : F->args()) MFI->addParam(getLegalType(getSimpleType(Arg.getType()))); + if (!F->getReturnType()->isVoidTy()) + MFI->addResult(getLegalType(getSimpleType(F->getReturnType()))); + return true; } diff --git a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp index d5474a02ce01..adf904ee0269 100644 --- a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp +++ b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp @@ -62,12 +62,19 @@ ModulePass *llvm::createWebAssemblyFixFunctionBitcasts() { // Recursively descend the def-use lists from V to find non-bitcast users of // bitcasts of V. 
static void FindUses(Value *V, Function &F, - SmallVectorImpl<std::pair<Use *, Function *>> &Uses) { + SmallVectorImpl<std::pair<Use *, Function *>> &Uses, + SmallPtrSetImpl<Constant *> &ConstantBCs) { for (Use &U : V->uses()) { if (BitCastOperator *BC = dyn_cast<BitCastOperator>(U.getUser())) - FindUses(BC, F, Uses); - else if (U.get()->getType() != F.getType()) + FindUses(BC, F, Uses, ConstantBCs); + else if (U.get()->getType() != F.getType()) { + if (isa<Constant>(U.get())) { + // Only add constant bitcasts to the list once; they get RAUW'd + auto c = ConstantBCs.insert(cast<Constant>(U.get())); + if (!c.second) continue; + } Uses.push_back(std::make_pair(&U, &F)); + } } } @@ -122,10 +129,10 @@ static Function *CreateWrapper(Function *F, FunctionType *Ty) { bool FixFunctionBitcasts::runOnModule(Module &M) { SmallVector<std::pair<Use *, Function *>, 0> Uses; + SmallPtrSet<Constant *, 2> ConstantBCs; // Collect all the places that need wrappers. - for (Function &F : M) - FindUses(&F, F, Uses); + for (Function &F : M) FindUses(&F, F, Uses, ConstantBCs); DenseMap<std::pair<Function *, FunctionType *>, Function *> Wrappers; diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index bf546dab5fbb..47aadf99e860 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -46,7 +46,7 @@ unsigned WebAssemblyTTIImpl::getRegisterBitWidth(bool Vector) { unsigned WebAssemblyTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) { unsigned Cost = BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost( Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo); diff --git a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index 2a2e3941f82d..f658609f8930 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -61,7 +61,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); /// @} diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index dc18a59a30ba..83a23d4ad680 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -209,9 +209,9 @@ def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb", "HasSlowDivide32", "true", "Use 8-bit divide for positive values less than 256">; -def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw", +def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl", "HasSlowDivide64", "true", - "Use 16-bit divide for positive values less than 65536">; + "Use 32-bit divide for positive values less than 2^32">; def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions", "PadShortFunctions", "true", "Pad short functions">; @@ -461,6 +461,7 @@ def SNBFeatures : ProcessorFeatures<[], [ 
FeatureCMPXCHG16B, FeaturePOPCNT, FeatureAES, + FeatureSlowDivide64, FeaturePCLMUL, FeatureXSAVE, FeatureXSAVEOPT, @@ -760,6 +761,42 @@ def : Proc<"bdver4", [ FeatureMWAITX ]>; +// TODO: The scheduler model falls to BTVER2 model. +// The znver1 model has to be put in place. +// Zen +def: ProcessorModel<"znver1", BtVer2Model, [ + FeatureADX, + FeatureAES, + FeatureAVX2, + FeatureBMI, + FeatureBMI2, + FeatureCLFLUSHOPT, + FeatureCMPXCHG16B, + FeatureF16C, + FeatureFMA, + FeatureFSGSBase, + FeatureFXSR, + FeatureFastLZCNT, + FeatureLAHFSAHF, + FeatureLZCNT, + FeatureMMX, + FeatureMOVBE, + FeatureMWAITX, + FeaturePCLMUL, + FeaturePOPCNT, + FeaturePRFCHW, + FeatureRDRAND, + FeatureRDSEED, + FeatureSHA, + FeatureSMAP, + FeatureSSE4A, + FeatureSlowSHLD, + FeatureX87, + FeatureXSAVE, + FeatureXSAVEC, + FeatureXSAVEOPT, + FeatureXSAVES]>; + def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA]>; def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>; diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 8b66790679d9..8ab4c0616880 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -183,16 +183,6 @@ namespace { void PreprocessISelDAG() override; - inline bool immSext8(SDNode *N) const { - return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue()); - } - - // True if the 64-bit immediate fits in a 32-bit sign-extended field. - inline bool i64immSExt32(SDNode *N) const { - uint64_t v = cast<ConstantSDNode>(N)->getZExtValue(); - return (int64_t)v == (int32_t)v; - } - // Include the pieces autogenerated from the target description. #include "X86GenDAGISel.inc" diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index db76ddf04c06..787dff99367e 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -97,12 +97,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); - // Bypass expensive divides on Atom when compiling with O2. + // Bypass expensive divides and use cheaper ones. if (TM.getOptLevel() >= CodeGenOpt::Default) { if (Subtarget.hasSlowDivide32()) addBypassSlowDiv(32, 8); if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit()) - addBypassSlowDiv(64, 16); + addBypassSlowDiv(64, 32); } if (Subtarget.isTargetKnownWindowsMSVC() || @@ -1280,6 +1280,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom); // FIXME. This commands are available on SSE/AVX2, add relevant patterns. 
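A note on the divide-bypass change above: with FeatureSlowDivide64 now meaning "32-bit divides are much faster than 64-bit ones", addBypassSlowDiv(64, 32) reroutes a 64-bit division through the 32-bit divider whenever both operands happen to fit in 32 bits at run time, which is why the Sandy Bridge feature set can enable it without penalizing genuinely 64-bit values. A minimal C++ sketch of the shape of code the bypass effectively produces (illustrative only; the pass emits equivalent IR, and the function name is made up):

    #include <cstdint>

    // 64-to-32 bit divide bypass: if both operands are < 2^32, use the much
    // cheaper 32-bit divide; otherwise fall back to the full 64-bit divide.
    uint64_t div_bypass(uint64_t a, uint64_t b) {
      if (((a | b) >> 32) == 0)
        return uint64_t(uint32_t(a) / uint32_t(b));
      return a / b;
    }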
setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal); @@ -1306,10 +1308,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); - if (Subtarget.hasDQI()) { - setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom); - } + for (auto VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::FCEIL, VT, Legal); @@ -8090,6 +8089,37 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, return Zeroable; } +// The Shuffle result is as follow: +// 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order. +// Each Zeroable's element correspond to a particular Mask's element. +// As described in computeZeroableShuffleElements function. +// +// The function looks for a sub-mask that the nonzero elements are in +// increasing order. If such sub-mask exist. The function returns true. +static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable, + ArrayRef<int> Mask,const EVT &VectorType, + bool &IsZeroSideLeft) { + int NextElement = -1; + // Check if the Mask's nonzero elements are in increasing order. + for (int i = 0, e = Zeroable.size(); i < e; i++) { + // Checks if the mask's zeros elements are built from only zeros. + if (Mask[i] == -1) + return false; + if (Zeroable[i]) + continue; + // Find the lowest non zero element + if (NextElement == -1) { + NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0; + IsZeroSideLeft = NextElement != 0; + } + // Exit if the mask's non zero elements are not in increasing order. + if (NextElement != Mask[i]) + return false; + NextElement++; + } + return true; +} + /// Try to lower a shuffle with a single PSHUFB of V1 or V2. static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, SDValue V1, @@ -8145,6 +8175,46 @@ static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, DAG.getBuildVector(I8VT, DL, PSHUFBMask))); } +static SDValue getMaskNode(SDValue Mask, MVT MaskVT, + const X86Subtarget &Subtarget, SelectionDAG &DAG, + const SDLoc &dl); + +// Function convertBitVectorToUnsigned - The function gets SmallBitVector +// as argument and convert him to unsigned. 
+// The output of the function is not(zeroable) +static unsigned convertBitVectorToUnsiged(const SmallBitVector &Zeroable) { + unsigned convertBit = 0; + for (int i = 0, e = Zeroable.size(); i < e; i++) + convertBit |= !(Zeroable[i]) << i; + return convertBit; +} + +// X86 has dedicated shuffle that can be lowered to VEXPAND +static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, + const SmallBitVector &Zeroable, + ArrayRef<int> Mask, SDValue &V1, + SDValue &V2, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + bool IsLeftZeroSide = true; + if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(), + IsLeftZeroSide)) + return SDValue(); + unsigned VEXPANDMask = convertBitVectorToUnsiged(Zeroable); + MVT IntegerType = + MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); + SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType); + unsigned NumElts = VT.getVectorNumElements(); + assert((NumElts == 4 || NumElts == 8 || NumElts == 16) && + "Unexpected number of vector elements"); + SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts), + Subtarget, DAG, DL); + SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL); + SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1; + return DAG.getNode(ISD::VSELECT, DL, VT, VMask, + DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector), + ZeroVector); +} + // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT, @@ -12159,6 +12229,11 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return Result; + // If we have VLX support, we can use VEXPAND. + if (Subtarget.hasVLX()) + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; // If we have AVX2 then we always want to lower with a blend because an v4 we // can fully permute the elements. @@ -12222,12 +12297,17 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, Zeroable, Subtarget, DAG)) return Shift; - // If we have VLX support, we can use VALIGN. - if (Subtarget.hasVLX()) + // If we have VLX support, we can use VALIGN or VEXPAND. + if (Subtarget.hasVLX()) { if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; + } + // Try to use PALIGNR. if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) @@ -12328,6 +12408,11 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Result; + // If we have VLX support, we can use VEXPAND. + if (Subtarget.hasVLX()) + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; // If we have AVX2 then we always want to lower with a blend because at v8 we // can fully permute the elements. @@ -12392,12 +12477,17 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, Zeroable, Subtarget, DAG)) return Shift; - // If we have VLX support, we can use VALIGN. - if (Subtarget.hasVLX()) + // If we have VLX support, we can use VALIGN or EXPAND. 
+ if (Subtarget.hasVLX()) { if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; + } + // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) @@ -12754,6 +12844,7 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT, /// \brief Handle lowering of 8-lane 64-bit floating point shuffles. static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12796,11 +12887,16 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Op; + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, + V2, DAG, Subtarget)) + return V; + return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit floating point shuffles. static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask, + const SmallBitVector &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12832,6 +12928,10 @@ static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask, // Otherwise, fall back to a SHUFPS sequence. return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); } + // If we have AVX512F support, we can use VEXPAND. + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); } @@ -12889,6 +12989,10 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (SDValue Unpck = lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) return Unpck; + // If we have AVX512F support, we can use VEXPAND. + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, + V2, DAG, Subtarget)) + return V; return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); } @@ -12953,6 +13057,10 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, CastV1, CastV2, DAG); return DAG.getBitcast(MVT::v16i32, ShufPS); } + // If we have AVX512F support, we can use VEXPAND. + if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, + V1, V2, DAG, Subtarget)) + return V; return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } @@ -13089,9 +13197,9 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // the requisite ISA extensions for that element type are available. 
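To make the new VEXPAND path concrete (more call sites follow in the 512-bit lowerings below): a shuffle qualifies when every lane is either provably zero or consumes source elements in strictly increasing order, so the result is an expand of a compressed source under a lane mask, blended with a zero vector. A minimal scalar sketch of the semantics, with the mask built from the not-zeroable lanes exactly as convertBitVectorToUnsiged does (illustrative only; eight f32 lanes chosen arbitrarily):

    #include <array>
    #include <cstdint>

    // Expand semantics: the first popcount(Mask) elements of Src are placed,
    // in order, into the lanes whose Mask bit is set; all other lanes stay zero.
    std::array<float, 8> expand(const std::array<float, 8> &Src, uint8_t Mask) {
      std::array<float, 8> Out{};   // zero vector, as in the VSELECT blend
      unsigned Next = 0;            // next compressed source element
      for (unsigned Lane = 0; Lane < 8; ++Lane)
        if (Mask & (1u << Lane))
          Out[Lane] = Src[Next++];
      return Out;
    }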
switch (VT.SimpleTy) { case MVT::v8f64: - return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16f32: - return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG); + return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i64: return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i32: @@ -15187,13 +15295,13 @@ static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, MVT InVT = In.getSimpleValueType(); SDLoc DL(Op); unsigned NumElts = VT.getVectorNumElements(); - if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI()) - return SDValue(); - if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) + if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 && + (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) return DAG.getNode(X86ISD::VZEXT, DL, VT, In); - assert(InVT.getVectorElementType() == MVT::i1); + if (InVT.getVectorElementType() != MVT::i1) + return SDValue(); // Extend VT if the target is 256 or 128bit vector and VLX is not supported. MVT ExtVT = VT; @@ -15910,6 +16018,12 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, } } + // Sometimes flags can be set either with an AND or with an SRL/SHL + // instruction. SRL/SHL variant should be preferred for masks longer than this + // number of bits. + const int ShiftToAndMaxMaskWidth = 32; + const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE); + // NOTICE: In the code below we use ArithOp to hold the arithmetic operation // which may be the result of a CAST. We use the variable 'Op', which is the // non-casted variable when we check for possible users. @@ -15958,7 +16072,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, // If we have a constant logical shift that's only used in a comparison // against zero turn it into an equivalent AND. This allows turning it into // a TEST instruction later. - if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() && + if (ZeroCheck && Op->hasOneUse() && isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) { EVT VT = Op.getValueType(); unsigned BitWidth = VT.getSizeInBits(); @@ -15968,7 +16082,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, APInt Mask = ArithOp.getOpcode() == ISD::SRL ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); - if (!Mask.isSignedIntN(32)) // Avoid large immediates. + if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth)) break; Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), DAG.getConstant(Mask, dl, VT)); @@ -15977,18 +16091,59 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, case ISD::AND: // If the primary 'and' result isn't used, don't bother using X86ISD::AND, - // because a TEST instruction will be better. + // because a TEST instruction will be better. However, AND should be + // preferred if the instruction can be combined into ANDN. 
if (!hasNonFlagsUse(Op)) { SDValue Op0 = ArithOp->getOperand(0); SDValue Op1 = ArithOp->getOperand(1); EVT VT = ArithOp.getValueType(); bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1); bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64; + bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI(); + + // If we cannot select an ANDN instruction, check if we can replace + // AND+IMM64 with a shift before giving up. This is possible for masks + // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag. + if (!isProperAndn) { + if (!ZeroCheck) + break; + + assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized"); + auto *CN = dyn_cast<ConstantSDNode>(Op1); + if (!CN) + break; + + const APInt &Mask = CN->getAPIntValue(); + if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth)) + break; // Prefer TEST instruction. + + unsigned BitWidth = Mask.getBitWidth(); + unsigned LeadingOnes = Mask.countLeadingOnes(); + unsigned TrailingZeros = Mask.countTrailingZeros(); + + if (LeadingOnes + TrailingZeros == BitWidth) { + assert(TrailingZeros < VT.getSizeInBits() && + "Shift amount should be less than the type width"); + MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT); + SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy); + Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt); + break; + } + + unsigned LeadingZeros = Mask.countLeadingZeros(); + unsigned TrailingOnes = Mask.countTrailingOnes(); + + if (LeadingZeros + TrailingOnes == BitWidth) { + assert(LeadingZeros < VT.getSizeInBits() && + "Shift amount should be less than the type width"); + MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT); + SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy); + Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt); + break; + } - // But if we can combine this into an ANDN operation, then create an AND - // now and allow it to be pattern matched into an ANDN. 
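The EmitTest rewrite above relies on a simple identity: when only the zero flag of an AND is consumed, a mask that is a single contiguous run of ones touching either end of the word can be traded for a shift, avoiding a 64-bit immediate that TEST cannot encode. A minimal sketch of both directions, using 64-bit masks wider than ShiftToAndMaxMaskWidth (the constants are illustrative, not from the patch):

    #include <cstdint>

    // Leading-ones mask: (x & 0xFFFFFFFF00000000) == 0  <=>  (x >> 32) == 0,
    // the SRL form chosen when LeadingOnes + TrailingZeros == BitWidth.
    bool high_half_is_zero(uint64_t x) { return (x >> 32) == 0; }

    // Trailing-ones mask: (x & 0x00000000FFFFFFFF) == 0  <=>  (x << 32) == 0,
    // the SHL form chosen when LeadingZeros + TrailingOnes == BitWidth.
    bool low_half_is_zero(uint64_t x) { return (x << 32) == 0; }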
- if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType) break; + } } LLVM_FALLTHROUGH; case ISD::SUB: @@ -16008,7 +16163,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, case ISD::XOR: Opcode = X86ISD::XOR; break; case ISD::AND: Opcode = X86ISD::AND; break; case ISD::OR: { - if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { + if (!NeedTruncation && ZeroCheck) { if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG)) return EFLAGS; } @@ -17283,17 +17438,20 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, unsigned NumElts = VT.getVectorNumElements(); - if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI()) - return SDValue(); - - if (VT.is512BitVector() && InVTElt != MVT::i1) { + if (VT.is512BitVector() && InVTElt != MVT::i1 && + (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) { if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT) return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0)); return DAG.getNode(X86ISD::VSEXT, dl, VT, In); } - assert (InVTElt == MVT::i1 && "Unexpected vector type"); - MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); + if (InVTElt != MVT::i1) + return SDValue(); + + MVT ExtVT = VT; + if (!VT.is512BitVector() && !Subtarget.hasVLX()) + ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); + SDValue V; if (Subtarget.hasDQI()) { V = DAG.getNode(X86ISD::VSEXT, dl, ExtVT, In); @@ -17302,7 +17460,7 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SDValue NegOne = getOnesVector(ExtVT, Subtarget, DAG, dl); SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl); V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero); - if (VT.is512BitVector()) + if (ExtVT == VT) return V; } @@ -18418,13 +18576,13 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt); else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { - SDValue Op0 = ShAmt.getOperand(0); - Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0); - ShAmt = DAG.getZeroExtendVectorInReg(Op0, SDLoc(Op0), MVT::v2i64); + ShAmt = ShAmt.getOperand(0); + ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt); + ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt); } else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt); - ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64); + ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt); } else { SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)}; @@ -21643,14 +21801,26 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, } if (VT == MVT::v16i8 || - (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) { + (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) || + (VT == MVT::v64i8 && Subtarget.hasBWI())) { MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); unsigned ShiftOpcode = Op->getOpcode(); auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { - // On SSE41 targets we make use of the fact that VSELECT lowers - // to PBLENDVB which selects bytes based just on the sign bit. 
- if (Subtarget.hasSSE41()) { + if (VT.is512BitVector()) { + // On AVX512BW targets we make use of the fact that VSELECT lowers + // to a masked blend which selects bytes based just on the sign bit + // extracted to a mask. + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + V0 = DAG.getBitcast(VT, V0); + V1 = DAG.getBitcast(VT, V1); + Sel = DAG.getBitcast(VT, Sel); + Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel); + return DAG.getBitcast(SelVT, + DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1)); + } else if (Subtarget.hasSSE41()) { + // On SSE41 targets we make use of the fact that VSELECT lowers + // to PBLENDVB which selects bytes based just on the sign bit. V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); @@ -28633,17 +28803,20 @@ static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, if (N->getOpcode() != ISD::VSELECT) return SDValue(); + assert(CondVT.isVector() && "Vector select expects a vector selector!"); + bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); - // Check if the first operand is all zeros.This situation only - // applies to avx512. - if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse()) { + // Check if the first operand is all zeros and Cond type is vXi1. + // This situation only applies to avx512. + if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() && + CondVT.getVectorElementType() == MVT::i1) { //Invert the cond to not(cond) : xor(op,allones)=not(op) SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, - DAG.getConstant(1, DL, Cond.getValueType())); + DAG.getConstant(APInt::getAllOnesValue(CondVT.getScalarSizeInBits()), + DL, CondVT)); //Vselect cond, op1, op2 = Vselect not(cond), op2, op1 return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS); } - assert(CondVT.isVector() && "Vector select expects a vector selector!"); // To use the condition operand as a bitwise mask, it must have elements that // are the same size as the select elements. Ie, the condition operand must @@ -29282,11 +29455,19 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// Combine: +/// Combine brcond/cmov/setcc/.. based on comparing the result of +/// atomic_load_add to use EFLAGS produced by the addition +/// directly if possible. For example: +/// +/// (setcc (cmp (atomic_load_add x, -C) C), COND_E) +/// becomes: +/// (setcc (LADD x, -C), COND_E) +/// +/// and /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) -/// to: +/// becomes: /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE) -/// i.e., reusing the EFLAGS produced by the LOCKed instruction. +/// /// Note that this is only legal for some op/cc combinations. static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG) { @@ -29295,7 +29476,13 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) return SDValue(); - // This only applies to variations of the common case: + // Can't replace the cmp if it has more uses than the one we're looking at. + // FIXME: We would like to be able to handle this, but would need to make sure + // all uses were updated. 
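A source-level view of the generalized combineSetCCAtomicArith (the one-use guard and the Comparand/Addend matching follow below): comparing the value returned by an atomic add or sub against the negated addend asks whether the memory location just reached zero, so the ZF produced by the LOCKed instruction can be reused instead of re-comparing. A minimal sketch of code with this shape (hedged: whether the flag reuse actually fires depends on how the surrounding compare is selected):

    #include <atomic>

    // Reference-count release: fetch_sub returns the old value, so old == 1
    // means the count just hit zero. This is the (atomic_load_sub x, 1) == 1
    // case, i.e. Comparand == -Addend, which keeps the condition code as-is.
    bool release(std::atomic<int> &RefCount) {
      return RefCount.fetch_sub(1, std::memory_order_acq_rel) == 1;
    }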
+ if (!Cmp.hasOneUse()) + return SDValue(); + + // This applies to variations of the common case: // (icmp slt x, 0) -> (icmp sle (add x, 1), 0) // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0) // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0) @@ -29314,8 +29501,9 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, return SDValue(); auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS); - if (!CmpRHSC || CmpRHSC->getZExtValue() != 0) + if (!CmpRHSC) return SDValue(); + APInt Comparand = CmpRHSC->getAPIntValue(); const unsigned Opc = CmpLHS.getOpcode(); @@ -29331,16 +29519,19 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, if (Opc == ISD::ATOMIC_LOAD_SUB) Addend = -Addend; - if (CC == X86::COND_S && Addend == 1) + if (Comparand == -Addend) { + // No change to CC. + } else if (CC == X86::COND_S && Comparand == 0 && Addend == 1) { CC = X86::COND_LE; - else if (CC == X86::COND_NS && Addend == 1) + } else if (CC == X86::COND_NS && Comparand == 0 && Addend == 1) { CC = X86::COND_G; - else if (CC == X86::COND_G && Addend == -1) + } else if (CC == X86::COND_G && Comparand == 0 && Addend == -1) { CC = X86::COND_GE; - else if (CC == X86::COND_LE && Addend == -1) + } else if (CC == X86::COND_LE && Comparand == 0 && Addend == -1) { CC = X86::COND_L; - else + } else { return SDValue(); + } SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), @@ -31083,10 +31274,15 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, /// Check if truncation with saturation form type \p SrcVT to \p DstVT /// is valid for the given \p Subtarget. -static bool -isSATValidOnSubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) { +static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT, + const X86Subtarget &Subtarget) { if (!Subtarget.hasAVX512()) return false; + + // FIXME: Scalar type may be supported if we move it to vector register. + if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512) + return false; + EVT SrcElVT = SrcVT.getScalarType(); EVT DstElVT = DstVT.getScalarType(); if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64) @@ -31098,40 +31294,69 @@ isSATValidOnSubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) { return false; } +/// Return true if VPACK* instruction can be used for the given types +/// and it is avalable on \p Subtarget. +static bool +isSATValidOnSSESubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) { + if (Subtarget.hasSSE2()) + // v16i16 -> v16i8 + if (SrcVT == MVT::v16i16 && DstVT == MVT::v16i8) + return true; + if (Subtarget.hasSSE41()) + // v8i32 -> v8i16 + if (SrcVT == MVT::v8i32 && DstVT == MVT::v8i16) + return true; + return false; +} + /// Detect a pattern of truncation with saturation: /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). /// Return the source value to be truncated or SDValue() if the pattern was not -/// matched or the unsupported on the current target. -static SDValue -detectUSatPattern(SDValue In, EVT VT, const X86Subtarget &Subtarget) { +/// matched. +static SDValue detectUSatPattern(SDValue In, EVT VT) { if (In.getOpcode() != ISD::UMIN) return SDValue(); - EVT InVT = In.getValueType(); - // FIXME: Scalar type may be supported if we move it to vector register. - if (!InVT.isVector() || !InVT.isSimple()) - return SDValue(); - - if (!isSATValidOnSubtarget(InVT, VT, Subtarget)) - return SDValue(); - //Saturation with truncation. We truncate from InVT to VT. 
- assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() && + assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() && "Unexpected types for truncate operation"); - SDValue SrcVal; APInt C; - if (ISD::isConstantSplatVector(In.getOperand(0).getNode(), C)) - SrcVal = In.getOperand(1); - else if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) - SrcVal = In.getOperand(0); - else + if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) { + // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according + // the element size of the destination type. + return APIntOps::isMask(VT.getScalarSizeInBits(), C) ? In.getOperand(0) : + SDValue(); + } + return SDValue(); +} + +/// Detect a pattern of truncation with saturation: +/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). +/// The types should allow to use VPMOVUS* instruction on AVX512. +/// Return the source value to be truncated or SDValue() if the pattern was not +/// matched. +static SDValue detectAVX512USatPattern(SDValue In, EVT VT, + const X86Subtarget &Subtarget) { + if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget)) return SDValue(); + return detectUSatPattern(In, VT); +} - // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according - // the element size of the destination type. - return (C == ((uint64_t)1 << VT.getScalarSizeInBits()) - 1) ? - SrcVal : SDValue(); +static SDValue +combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + SDValue USatVal = detectUSatPattern(In, VT); + if (USatVal) { + if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget)) + return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal); + if (isSATValidOnSSESubtarget(In.getValueType(), VT, Subtarget)) { + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(USatVal, DL); + return DAG.getNode(X86ISD::PACKUS, DL, VT, Lo, Hi); + } + } + return SDValue(); } /// This function detects the AVG pattern between vectors of unsigned i8/i16, @@ -31701,7 +31926,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, St->getMemOperand()->getFlags()); if (SDValue Val = - detectUSatPattern(St->getValue(), St->getMemoryVT(), Subtarget)) + detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget)) return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(), dl, Val, St->getBasePtr(), St->getMemoryVT(), St->getMemOperand(), DAG); @@ -32326,9 +32551,9 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL)) return Avg; - // Try the truncation with unsigned saturation. - if (SDValue Val = detectUSatPattern(Src, VT, Subtarget)) - return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Val); + // Try to combine truncation with unsigned saturation. + if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget)) + return Val; // The bitcast source is a direct mmx result. 
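For reference, the clamp that detectUSatPattern looks for is the ordinary clamp-then-narrow idiom, and the splat constant must be exactly the unsigned maximum of the destination element type (the isMask check). A minimal scalar sketch of one lane (illustrative; the vector form is what combineTruncateWithUSat maps to VPMOVUS* on AVX-512, or to a split plus PACKUS on the SSE subtargets listed above):

    #include <algorithm>
    #include <cstdint>

    // One lane of "truncate (umin x, 0xFFFF) to i16": values above the i16
    // unsigned maximum saturate instead of wrapping.
    uint16_t trunc_usat(uint32_t X) {
      return static_cast<uint16_t>(std::min(X, 0xFFFFu));
    }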
// Detect bitcasts between i32 to x86mmx diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index d44d1395f243..230d1700b8d2 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -5957,6 +5957,30 @@ let Predicates = [HasAVX512] in { (VCVTUSI2SDZrm_Int VR128X:$src1, addr:$src2)>; } // Predicates = [HasAVX512] +// Patterns used for matching vcvtsi2s{s,d} intrinsic sequences from clang +// which produce unnecessary vmovs{s,d} instructions +let Predicates = [HasAVX512] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (VCVTSI642SSZrr_Int VR128X:$dst, GR64:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (VCVTSI2SSZrr_Int VR128X:$dst, GR32:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (VCVTSI642SDZrr_Int VR128X:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (VCVTSI2SDZrr_Int VR128X:$dst, GR32:$src)>; +} // Predicates = [HasAVX512] + // Convert float/double to signed/unsigned int 32/64 with truncation multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC, X86VectorVTInfo _DstRC, SDNode OpNode, @@ -6136,6 +6160,21 @@ def : Pat<(f32 (fpround FR64X:$src)), (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X), (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>, Requires<[HasAVX512]>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector + (f32 (fpround (f64 (extractelt VR128X:$src, (iPTR 0))))))))), + (VCVTSD2SSZrr VR128X:$dst, VR128X:$src)>, + Requires<[HasAVX512]>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector + (f64 (fpextend (f32 (extractelt VR128X:$src, (iPTR 0))))))))), + (VCVTSS2SDZrr VR128X:$dst, VR128X:$src)>, + Requires<[HasAVX512]>; + //===----------------------------------------------------------------------===// // AVX-512 Vector convert from signed/unsigned integer to float/double // and from float/double to signed/unsigned integer diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 09971d586a41..1812d01711d1 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -33,7 +33,6 @@ class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm, InstrItinClass ri = arg_ri; } - // scalar let Sched = WriteFAdd in { def SSE_ALU_F32S : OpndItins< @@ -1923,6 +1922,79 @@ def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, } } // isCodeGenOnly = 1 +// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and +// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary +// vmovs{s,d} instructions +let Predicates = [UseAVX] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector + (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_VCVTSD2SSrr VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector + (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_VCVTSS2SDrr VR128:$dst, VR128:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (Int_VCVTSI2SS64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + 
(Int_VCVTSI2SSrr VR128:$dst, GR32:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (Int_VCVTSI2SD64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (Int_VCVTSI2SDrr VR128:$dst, GR32:$src)>; +} // Predicates = [UseAVX] + +let Predicates = [UseSSE2] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector + (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_CVTSD2SSrr VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector + (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), + (Int_CVTSS2SDrr VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (Int_CVTSI2SD64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (Int_CVTSI2SDrr VR128:$dst, GR32:$src)>; +} // Predicates = [UseSSE2] + +let Predicates = [UseSSE1] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (Int_CVTSI2SS64rr VR128:$dst, GR64:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (Int_CVTSI2SSrr VR128:$dst, GR32:$src)>; +} // Predicates = [UseSSE1] + // Convert packed single/double fp to doubleword def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 92c16214aa4a..d80dc4a9b5e8 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -216,7 +216,7 @@ protected: /// 32-bit divisions and should be used when possible. bool HasSlowDivide32; - /// True if 16-bit divides are significantly faster than + /// True if 32-bit divides are significantly faster than /// 64-bit divisions and should be used when possible. bool HasSlowDivide64; diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 107ed9359376..5715d826862e 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -114,15 +114,62 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { } int X86TTIImpl::getArithmeticInstrCost( - unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, - TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo) { + unsigned Opcode, Type *Ty, + TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, + TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef<const Value *> Args) { // Legalize the type. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + static const CostTblEntry SLMCostTable[] = { + { ISD::MUL, MVT::v4i32, 11 }, // pmulld + { ISD::MUL, MVT::v8i16, 2 }, // pmullw + { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence. 
+ { ISD::FMUL, MVT::f64, 2 }, // mulsd + { ISD::FMUL, MVT::v2f64, 4 }, // mulpd + { ISD::FMUL, MVT::v4f32, 2 }, // mulps + { ISD::FDIV, MVT::f32, 17 }, // divss + { ISD::FDIV, MVT::v4f32, 39 }, // divps + { ISD::FDIV, MVT::f64, 32 }, // divsd + { ISD::FDIV, MVT::v2f64, 69 }, // divpd + { ISD::FADD, MVT::v2f64, 2 }, // addpd + { ISD::FSUB, MVT::v2f64, 2 }, // subpd + // v2i64/v4i64 mul is custom lowered as a series of long + // multiplies(3), shifts(3) and adds(2). + // slm muldq version throughput is 2 + { ISD::MUL, MVT::v2i64, 11 }, + }; + + if (ST->isSLM()) { + if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) { + // Check if the operands can be shrunk into a smaller datatype. + bool Op1Signed = false; + unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed); + bool Op2Signed = false; + unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed); + + bool signedMode = Op1Signed | Op2Signed; + unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize); + + if (OpMinSize <= 7) + return LT.first * 3; // pmullw/sext + if (!signedMode && OpMinSize <= 8) + return LT.first * 3; // pmullw/zext + if (OpMinSize <= 15) + return LT.first * 5; // pmullw/pmulhw/pshuf + if (!signedMode && OpMinSize <= 16) + return LT.first * 5; // pmullw/pmulhw/pshuf + } + if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, + LT.second)) { + return LT.first * Entry->Cost; + } + } + if (ISD == ISD::SDIV && Op2Info == TargetTransformInfo::OK_UniformConstantValue && Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { @@ -276,6 +323,10 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw { ISD::SRA, MVT::v32i16, 1 }, // vpsravw + { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence. + { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence. + { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence. + { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence. { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence. { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence. diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h index c013805f4321..ecaaf951cff7 100644 --- a/lib/Target/X86/X86TargetTransformInfo.h +++ b/lib/Target/X86/X86TargetTransformInfo.h @@ -60,7 +60,8 @@ public: TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, - TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef<const Value *> Args = ArrayRef<const Value *>()); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp index 82daf754be0d..deb7e819480b 100644 --- a/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/lib/Transforms/IPO/LowerTypeTests.cpp @@ -270,12 +270,12 @@ class LowerTypeTestsModule { /// relative to the start address. Constant *AlignLog2; - /// ByteArray, Inline, AllOnes: size of the memory region covering members - /// of this type identifier as a multiple of 2^AlignLog2. - Constant *Size; + /// ByteArray, Inline, AllOnes: one less than the size of the memory region + /// covering members of this type identifier as a multiple of 2^AlignLog2.
+ Constant *SizeM1; - /// ByteArray, Inline, AllOnes: range of the size expressed as a bit width. - unsigned SizeBitWidth; + /// ByteArray, Inline, AllOnes: range of SizeM1 expressed as a bit width. + unsigned SizeM1BitWidth; /// ByteArray: the byte array to test the address against. Constant *TheByteArray; @@ -593,8 +593,8 @@ Value *LowerTypeTestsModule::lowerTypeTestCall(Metadata *TypeId, CallInst *CI, IntPtrTy)); Value *BitOffset = B.CreateOr(OffsetSHR, OffsetSHL); - Constant *BitSizeConst = ConstantExpr::getZExt(TIL.Size, IntPtrTy); - Value *OffsetInRange = B.CreateICmpULT(BitOffset, BitSizeConst); + Constant *BitSizeConst = ConstantExpr::getZExt(TIL.SizeM1, IntPtrTy); + Value *OffsetInRange = B.CreateICmpULE(BitOffset, BitSizeConst); // If the bit set is all ones, testing against it is unnecessary. if (TIL.TheKind == TypeTestResolution::AllOnes) @@ -711,13 +711,13 @@ void LowerTypeTestsModule::lowerTypeTestCalls( if (BSI.isAllOnes()) { TIL.TheKind = (BSI.BitSize == 1) ? TypeTestResolution::Single : TypeTestResolution::AllOnes; - TIL.SizeBitWidth = (BSI.BitSize <= 256) ? 8 : 32; - TIL.Size = ConstantInt::get((BSI.BitSize <= 256) ? Int8Ty : Int32Ty, - BSI.BitSize); + TIL.SizeM1BitWidth = (BSI.BitSize <= 128) ? 7 : 32; + TIL.SizeM1 = ConstantInt::get((BSI.BitSize <= 128) ? Int8Ty : Int32Ty, + BSI.BitSize - 1); } else if (BSI.BitSize <= 64) { TIL.TheKind = TypeTestResolution::Inline; - TIL.SizeBitWidth = (BSI.BitSize <= 32) ? 5 : 6; - TIL.Size = ConstantInt::get(Int8Ty, BSI.BitSize); + TIL.SizeM1BitWidth = (BSI.BitSize <= 32) ? 5 : 6; + TIL.SizeM1 = ConstantInt::get(Int8Ty, BSI.BitSize - 1); uint64_t InlineBits = 0; for (auto Bit : BSI.Bits) InlineBits |= uint64_t(1) << Bit; @@ -728,9 +728,9 @@ void LowerTypeTestsModule::lowerTypeTestCalls( (BSI.BitSize <= 32) ? Int32Ty : Int64Ty, InlineBits); } else { TIL.TheKind = TypeTestResolution::ByteArray; - TIL.SizeBitWidth = (BSI.BitSize <= 256) ? 8 : 32; - TIL.Size = ConstantInt::get((BSI.BitSize <= 256) ? Int8Ty : Int32Ty, - BSI.BitSize); + TIL.SizeM1BitWidth = (BSI.BitSize <= 128) ? 7 : 32; + TIL.SizeM1 = ConstantInt::get((BSI.BitSize <= 128) ? Int8Ty : Int32Ty, + BSI.BitSize - 1); ++NumByteArraysCreated; ByteArrayInfo *BAI = createByteArray(BSI); TIL.TheByteArray = BAI->ByteArray; diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 55151c13b430..2d34c1cc74bd 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1371,15 +1371,9 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(), DL, &TLI, &DT, &AC)) return replaceInstUsesWith(I, V); - if (isa<Constant>(RHS)) { - if (isa<PHINode>(LHS)) - if (Instruction *NV = FoldOpIntoPhi(I)) - return NV; - - if (SelectInst *SI = dyn_cast<SelectInst>(LHS)) - if (Instruction *NV = FoldOpIntoSelect(I, SI)) - return NV; - } + if (isa<Constant>(RHS)) + if (Instruction *FoldedFAdd = foldOpWithConstantIntoOperand(I)) + return FoldedFAdd; // -A + B --> B - A // -A + -B --> -(A + B) diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index a59b43d6af5f..da5384a86aac 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1382,13 +1382,8 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { } } - // Try to fold constant and into select arguments. 
- if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) - if (Instruction *R = FoldOpIntoSelect(I, SI)) - return R; - if (isa<PHINode>(Op0)) - if (Instruction *NV = FoldOpIntoPhi(I)) - return NV; + if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I)) + return FoldedLogic; } if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder)) @@ -2125,14 +2120,8 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { Builder->getInt(C1->getValue() & ~RHS->getValue())); } - // Try to fold constant and into select arguments. - if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) - if (Instruction *R = FoldOpIntoSelect(I, SI)) - return R; - - if (isa<PHINode>(Op0)) - if (Instruction *NV = FoldOpIntoPhi(I)) - return NV; + if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I)) + return FoldedLogic; } // Given an OR instruction, check to see if this is a bswap. @@ -2594,13 +2583,8 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { } } - // Try to fold constant and into select arguments. - if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) - if (Instruction *R = FoldOpIntoSelect(I, SI)) - return R; - if (isa<PHINode>(Op0)) - if (Instruction *NV = FoldOpIntoPhi(I)) - return NV; + if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I)) + return FoldedLogic; } BinaryOperator *Op1I = dyn_cast<BinaryOperator>(Op1); diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index b29ed3c87451..2ef82ba3ed8c 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1599,21 +1599,17 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // fma fneg(x), fneg(y), z -> fma x, y, z if (match(Src0, m_FNeg(m_Value(LHS))) && match(Src1, m_FNeg(m_Value(RHS)))) { - CallInst *NewCall = Builder->CreateCall(II->getCalledFunction(), - {LHS, RHS, II->getArgOperand(2)}); - NewCall->takeName(II); - NewCall->copyFastMathFlags(II); - return replaceInstUsesWith(*II, NewCall); + II->setArgOperand(0, LHS); + II->setArgOperand(1, RHS); + return II; } // fma fabs(x), fabs(x), z -> fma x, x, z if (match(Src0, m_Intrinsic<Intrinsic::fabs>(m_Value(LHS))) && match(Src1, m_Intrinsic<Intrinsic::fabs>(m_Value(RHS))) && LHS == RHS) { - CallInst *NewCall = Builder->CreateCall(II->getCalledFunction(), - {LHS, LHS, II->getArgOperand(2)}); - NewCall->takeName(II); - NewCall->copyFastMathFlags(II); - return replaceInstUsesWith(*II, NewCall); + II->setArgOperand(0, LHS); + II->setArgOperand(1, RHS); + return II; } // fma x, 1, z -> fadd x, z @@ -2760,6 +2756,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (KnownOne.isAllOnesValue()) return eraseInstFromFunction(*II); + // Update the cache of affected values for this assumption (we might be + // here because we just simplified the condition). + AC.updateAffectedValues(II); break; } case Intrinsic::experimental_gc_relocate: { diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h index 3cefe715e567..2847ce858e79 100644 --- a/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/lib/Transforms/InstCombine/InstCombineInternal.h @@ -320,7 +320,6 @@ private: Value *dyn_castFNegVal(Value *V, bool NoSignedZero = false) const; Type *FindElementAtOffset(PointerType *PtrTy, int64_t Offset, SmallVectorImpl<Value *> &NewIndices); - Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI); /// Classify whether a cast is worth optimizing. 
/// @@ -537,13 +536,21 @@ private: Value *SimplifyVectorOp(BinaryOperator &Inst); Value *SimplifyBSwap(BinaryOperator &Inst); - // FoldOpIntoPhi - Given a binary operator, cast instruction, or select - // which has a PHI node as operand #0, see if we can fold the instruction - // into the PHI (which is only possible if all operands to the PHI are - // constants). - // + + /// Given a binary operator, cast instruction, or select which has a PHI node + /// as operand #0, see if we can fold the instruction into the PHI (which is + /// only possible if all operands to the PHI are constants). Instruction *FoldOpIntoPhi(Instruction &I); + /// Given an instruction with a select as one operand and a constant as the + /// other operand, try to fold the binary operator into the select arguments. + /// This also works for Cast instructions, which obviously do not have a + /// second operand. + Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI); + + /// This is a convenience wrapper function for the above two functions. + Instruction *foldOpWithConstantIntoOperand(Instruction &I); + /// \brief Try to rotate an operation below a PHI node, using PHI nodes for /// its operands. Instruction *FoldPHIArgOpIntoPHI(PHINode &PN); diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index ac64671725f3..45a19fb0f1f2 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -267,14 +267,8 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { // Simplify mul instructions with a constant RHS. if (isa<Constant>(Op1)) { - // Try to fold constant mul into select arguments. - if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) - if (Instruction *R = FoldOpIntoSelect(I, SI)) - return R; - - if (isa<PHINode>(Op0)) - if (Instruction *NV = FoldOpIntoPhi(I)) - return NV; + if (Instruction *FoldedMul = foldOpWithConstantIntoOperand(I)) + return FoldedMul; // Canonicalize (X+C1)*CI -> X*CI+C1*CI. { @@ -626,14 +620,8 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { // Simplify mul instructions with a constant RHS. if (isa<Constant>(Op1)) { - // Try to fold constant mul into select arguments. 
- if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) - if (Instruction *R = FoldOpIntoSelect(I, SI)) - return R; - - if (isa<PHINode>(Op0)) - if (Instruction *NV = FoldOpIntoPhi(I)) - return NV; + if (Instruction *FoldedMul = foldOpWithConstantIntoOperand(I)) + return FoldedMul; // (fmul X, -1.0) --> (fsub -0.0, X) if (match(Op1, m_SpecificFP(-1.0))) { @@ -956,14 +944,9 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) { } } - if (*C2 != 0) { // avoid X udiv 0 - if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) - if (Instruction *R = FoldOpIntoSelect(I, SI)) - return R; - if (isa<PHINode>(Op0)) - if (Instruction *NV = FoldOpIntoPhi(I)) - return NV; - } + if (*C2 != 0) // avoid X udiv 0 + if (Instruction *FoldedDiv = foldOpWithConstantIntoOperand(I)) + return FoldedDiv; } } @@ -1443,6 +1426,16 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { } } + Value *LHS; + Value *RHS; + + // -x / -y -> x / y + if (match(Op0, m_FNeg(m_Value(LHS))) && match(Op1, m_FNeg(m_Value(RHS)))) { + I.setOperand(0, LHS); + I.setOperand(1, RHS); + return &I; + } + return nullptr; } diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp index 184897f751fe..4cbffe9533b7 100644 --- a/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -29,7 +29,7 @@ using namespace llvm::PatternMatch; /// locations of the original PHI node arguments. DebugLoc InstCombiner::PHIArgMergedDebugLoc(PHINode &PN) { auto *FirstInst = cast<Instruction>(PN.getIncomingValue(0)); - DILocation *Loc = FirstInst->getDebugLoc(); + const DILocation *Loc = FirstInst->getDebugLoc(); for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) { auto *I = cast<Instruction>(PN.getIncomingValue(i)); diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index 5ad2a1c0e3e6..4ff9b64ac57c 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -530,13 +530,8 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1, return BinaryOperator::CreateMul(BO->getOperand(0), ConstantExpr::getShl(BOOp, Op1)); - // Try to fold constant and into select arguments. - if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) - if (Instruction *R = FoldOpIntoSelect(I, SI)) - return R; - if (isa<PHINode>(Op0)) - if (Instruction *NV = FoldOpIntoPhi(I)) - return NV; + if (Instruction *FoldedShift = foldOpWithConstantIntoOperand(I)) + return FoldedShift; // Fold shift2(trunc(shift1(x,c1)), c2) -> trunc(shift2(shift1(x,c1),c2)) if (TruncInst *TI = dyn_cast<TruncInst>(Op0)) { diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 9a52874c4c21..27fc34d23175 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -770,10 +770,6 @@ static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO, return RI; } -/// Given an instruction with a select as one operand and a constant as the -/// other operand, try to fold the binary operator into the select arguments. -/// This also works for Cast instructions, which obviously do not have a second -/// operand. Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { // Don't modify shared select instructions. 
if (!SI->hasOneUse()) @@ -824,9 +820,6 @@ Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { return SelectInst::Create(SI->getCondition(), NewTV, NewFV, "", nullptr, SI); } -/// Given a binary operator, cast instruction, or select which has a PHI node as -/// operand #0, see if we can fold the instruction into the PHI (which is only -/// possible if all operands to the PHI are constants). Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { PHINode *PN = cast<PHINode>(I.getOperand(0)); unsigned NumPHIValues = PN->getNumIncomingValues(); @@ -964,6 +957,19 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { return replaceInstUsesWith(I, NewPN); } +Instruction *InstCombiner::foldOpWithConstantIntoOperand(Instruction &I) { + assert(isa<Constant>(I.getOperand(1)) && "Unexpected operand type"); + + if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) { + if (Instruction *NewSel = FoldOpIntoSelect(I, Sel)) + return NewSel; + } else if (isa<PHINode>(I.getOperand(0))) { + if (Instruction *NewPhi = FoldOpIntoPhi(I)) + return NewPhi; + } + return nullptr; +} + /// Given a pointer type and a constant offset, determine whether or not there /// is a sequence of GEP indices into the pointed type that will land us at the /// specified offset. If so, fill them into NewIndices and return the resultant diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 54bdc9e0772b..9c4b417e35e1 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1598,8 +1598,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) { StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy, nullptr); unsigned SizeOfGlobalStruct = DL.getTypeAllocSize(GlobalStructTy); - assert((isPowerOf2_32(SizeOfGlobalStruct) || - !TargetTriple.isOSBinFormatCOFF()) && + assert(isPowerOf2_32(SizeOfGlobalStruct) && "global metadata will not be padded appropriately"); SmallVector<Constant *, 16> Initializers(UseMetadataArray ? n : 0); @@ -1766,13 +1765,11 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) { GlobalValue::getRealLinkageName(G->getName())); Metadata->setSection(getGlobalMetadataSection()); + // We don't want any padding, but we also need a reasonable alignment. // The MSVC linker always inserts padding when linking incrementally. We // cope with that by aligning each struct to its size, which must be a power // of two. - if (TargetTriple.isOSBinFormatCOFF()) - Metadata->setAlignment(SizeOfGlobalStruct); - else - Metadata->setAlignment(1); // Don't leave padding in between. + Metadata->setAlignment(SizeOfGlobalStruct); // On platforms that support comdats, put the metadata and the // instrumented global in the same group. 
This ensures that the metadata diff --git a/lib/Transforms/Instrumentation/InstrProfiling.cpp b/lib/Transforms/Instrumentation/InstrProfiling.cpp index 8da3e31200f3..adea7e772447 100644 --- a/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -32,6 +32,11 @@ cl::opt<bool> DoNameCompression("enable-name-compression", cl::desc("Enable name string compression"), cl::init(true)); +cl::opt<bool> DoHashBasedCounterSplit( + "hash-based-counter-split", + cl::desc("Rename counter variable of a comdat function based on cfg hash"), + cl::init(true)); + cl::opt<bool> ValueProfileStaticAlloc( "vp-static-alloc", cl::desc("Do static counter allocation for value profiler"), @@ -272,7 +277,16 @@ void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) { static std::string getVarName(InstrProfIncrementInst *Inc, StringRef Prefix) { StringRef NamePrefix = getInstrProfNameVarPrefix(); StringRef Name = Inc->getName()->getName().substr(NamePrefix.size()); - return (Prefix + Name).str(); + Function *F = Inc->getParent()->getParent(); + Module *M = F->getParent(); + if (!DoHashBasedCounterSplit || !isIRPGOFlagSet(M) || + !canRenameComdatFunc(*F)) + return (Prefix + Name).str(); + uint64_t FuncHash = Inc->getHash()->getZExtValue(); + SmallVector<char, 24> HashPostfix; + if (Name.endswith((Twine(".") + Twine(FuncHash)).toStringRef(HashPostfix))) + return (Prefix + Name).str(); + return (Prefix + Name + "." + Twine(FuncHash)).str(); } static inline bool shouldRecordFunctionAddr(Function *F) { diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 28f4f7ea1455..04f9a64bef9f 100644 --- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -119,7 +119,7 @@ static cl::opt<unsigned> MaxNumAnnotations( // Command line option to control appending FunctionHash to the name of a COMDAT // function. This is to avoid the hash mismatch caused by the preinliner. static cl::opt<bool> DoComdatRenaming( - "do-comdat-renaming", cl::init(true), cl::Hidden, + "do-comdat-renaming", cl::init(false), cl::Hidden, cl::desc("Append function hash to the name of COMDAT function to avoid " "function hash mismatch due to the preinliner")); @@ -134,6 +134,12 @@ static cl::opt<bool> PGOWarnMissing("pgo-warn-missing-function", static cl::opt<bool> NoPGOWarnMismatch("no-pgo-warn-mismatch", cl::init(false), cl::Hidden); +// Command line option to enable/disable the warning about a hash mismatch in +// the profile data for Comdat functions, which often turns out to be false +// positive due to the pre-instrumentation inline. +static cl::opt<bool> NoPGOWarnMismatchComdat("no-pgo-warn-mismatch-comdat", + cl::init(true), cl::Hidden); + // Command line option to enable/disable select instruction instrumentation. static cl::opt<bool> PGOInstrSelect("pgo-instr-select", cl::init(true), cl::Hidden); @@ -407,21 +413,9 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() { static bool canRenameComdat( Function &F, std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers) { - if (F.getName().empty()) - return false; - if (!needsComdatForCounter(F, *(F.getParent()))) - return false; - // Only safe to do if this function may be discarded if it is not used - // in the compilation unit. 
- if (!GlobalValue::isDiscardableIfUnused(F.getLinkage())) + if (!DoComdatRenaming || !canRenameComdatFunc(F, true)) return false; - // For AvailableExternallyLinkage functions. - if (!F.hasComdat()) { - assert(F.getLinkage() == GlobalValue::AvailableExternallyLinkage); - return true; - } - // FIXME: Current only handle those Comdat groups that only containing one // function and function aliases. // (1) For a Comdat group containing multiple functions, we need to have a @@ -803,7 +797,11 @@ bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader) { } else if (Err == instrprof_error::hash_mismatch || Err == instrprof_error::malformed) { NumOfPGOMismatch++; - SkipWarning = NoPGOWarnMismatch; + SkipWarning = + NoPGOWarnMismatch || + (NoPGOWarnMismatchComdat && + (F.hasComdat() || + F.getLinkage() == GlobalValue::AvailableExternallyLinkage)); } if (SkipWarning) diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index 56df77f03028..06d3d6a73954 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -13,10 +13,12 @@ add_llvm_library(LLVMScalarOpts GuardWidening.cpp GVN.cpp GVNHoist.cpp + IVUsersPrinter.cpp InductiveRangeCheckElimination.cpp IndVarSimplify.cpp JumpThreading.cpp LICM.cpp + LoopAccessAnalysisPrinter.cpp LoopSink.cpp LoadCombine.cpp LoopDeletion.cpp @@ -26,6 +28,7 @@ add_llvm_library(LLVMScalarOpts LoopInstSimplify.cpp LoopInterchange.cpp LoopLoadElimination.cpp + LoopPassManager.cpp LoopRerollPass.cpp LoopRotation.cpp LoopSimplifyCFG.cpp diff --git a/lib/Transforms/Scalar/IVUsersPrinter.cpp b/lib/Transforms/Scalar/IVUsersPrinter.cpp new file mode 100644 index 000000000000..807593379283 --- /dev/null +++ b/lib/Transforms/Scalar/IVUsersPrinter.cpp @@ -0,0 +1,22 @@ +//===- IVUsersPrinter.cpp - Induction Variable Users Printer ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/IVUsersPrinter.h" +#include "llvm/Analysis/IVUsers.h" +#include "llvm/Support/Debug.h" +using namespace llvm; + +#define DEBUG_TYPE "iv-users" + +PreservedAnalyses IVUsersPrinterPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &U) { + AM.getResult<IVUsersAnalysis>(L, AR).print(OS); + return PreservedAnalyses::all(); +} diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 68faa886060a..1752fb75eb1b 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -25,15 +25,13 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/IndVarSimplify.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/LoopPassManager.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" @@ -49,6 +47,8 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -2096,7 +2096,7 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, return Builder.CreateGEP(nullptr, GEPBase, GEPOffset, "lftr.limit"); } else { // In any other case, convert both IVInit and IVCount to integers before - // comparing. This may result in SCEV expension of pointers, but in practice + // comparing. This may result in SCEV expansion of pointers, but in practice // SCEV will fold the pointer arithmetic away as such: // BECount = (IVEnd - IVInit - 1) => IVLimit = IVInit (postinc). // @@ -2482,23 +2482,13 @@ bool IndVarSimplify::run(Loop *L) { return Changed; } -PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM) { - auto &FAM = AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager(); +PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &) { Function *F = L.getHeader()->getParent(); const DataLayout &DL = F->getParent()->getDataLayout(); - auto *LI = FAM.getCachedResult<LoopAnalysis>(*F); - auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F); - auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F); - - assert((LI && SE && DT) && - "Analyses required for indvarsimplify not available!"); - - // Optional analyses. 
- auto *TTI = FAM.getCachedResult<TargetIRAnalysis>(*F); - auto *TLI = FAM.getCachedResult<TargetLibraryAnalysis>(*F); - - IndVarSimplify IVS(LI, SE, DT, DL, TLI, TTI); + IndVarSimplify IVS(&AR.LI, &AR.SE, &AR.DT, DL, &AR.TLI, &AR.TTI); if (!IVS.run(&L)) return PreservedAnalyses::all(); diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 6ef9d0561322..4c15c8a32bec 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -41,8 +41,8 @@ #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -61,6 +61,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -84,14 +85,17 @@ static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI); static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo); static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, - const LoopSafetyInfo *SafetyInfo); + const LoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE); static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT, const Loop *CurLoop, AliasSetTracker *CurAST, - const LoopSafetyInfo *SafetyInfo); -static bool isSafeToExecuteUnconditionally(const Instruction &Inst, + const LoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE); +static bool isSafeToExecuteUnconditionally(Instruction &Inst, const DominatorTree *DT, const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE, const Instruction *CtxI = nullptr); static bool pointerInvalidatedByLoop(Value *V, uint64_t Size, const AAMDNodes &AAInfo, @@ -104,7 +108,8 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN, namespace { struct LoopInvariantCodeMotion { bool runOnLoop(Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT, - TargetLibraryInfo *TLI, ScalarEvolution *SE, bool DeleteAST); + TargetLibraryInfo *TLI, ScalarEvolution *SE, + OptimizationRemarkEmitter *ORE, bool DeleteAST); DenseMap<Loop *, AliasSetTracker *> &getLoopToAliasSetMap() { return LoopToAliasSetMap; @@ -135,12 +140,16 @@ struct LegacyLICMPass : public LoopPass { } auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); + // For the old PM, we can't use OptimizationRemarkEmitter as an analysis + // pass. Function analyses need to be preserved across loop transformations + // but ORE cannot be preserved (see comment before the pass definition). + OptimizationRemarkEmitter ORE(L->getHeader()->getParent()); return LICM.runOnLoop(L, &getAnalysis<AAResultsWrapperPass>().getAAResults(), &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), - SE ? &SE->getSE() : nullptr, false); + SE ? 
&SE->getSE() : nullptr, &ORE, false); } /// This transformation requires natural loop information & requires that @@ -176,21 +185,20 @@ private: }; } -PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM) { +PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &) { const auto &FAM = - AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager(); + AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager(); Function *F = L.getHeader()->getParent(); - auto *AA = FAM.getCachedResult<AAManager>(*F); - auto *LI = FAM.getCachedResult<LoopAnalysis>(*F); - auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F); - auto *TLI = FAM.getCachedResult<TargetLibraryAnalysis>(*F); - auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F); - assert((AA && LI && DT && TLI && SE) && "Analyses for LICM not available"); + auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F); + // FIXME: This should probably be optional rather than required. + if (!ORE) + report_fatal_error("LICM: OptimizationRemarkEmitterAnalysis not " + "cached at a higher level"); LoopInvariantCodeMotion LICM; - - if (!LICM.runOnLoop(&L, AA, LI, DT, TLI, SE, true)) + if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.SE, ORE, true)) return PreservedAnalyses::all(); // FIXME: There is no setPreservesCFG in the new PM. When that becomes @@ -217,7 +225,9 @@ Pass *llvm::createLICMPass() { return new LegacyLICMPass(); } bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT, TargetLibraryInfo *TLI, - ScalarEvolution *SE, bool DeleteAST) { + ScalarEvolution *SE, + OptimizationRemarkEmitter *ORE, + bool DeleteAST) { bool Changed = false; assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form."); @@ -243,10 +253,10 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA, // if (L->hasDedicatedExits()) Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L, - CurAST, &SafetyInfo); + CurAST, &SafetyInfo, ORE); if (Preheader) Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L, - CurAST, &SafetyInfo); + CurAST, &SafetyInfo, ORE); // Now that all loop invariants have been removed from the loop, promote any // memory references to scalars that we can. @@ -279,7 +289,7 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA, for (AliasSet &AS : *CurAST) Promoted |= promoteLoopAccessesToScalars(AS, ExitBlocks, InsertPts, PIC, LI, DT, - TLI, L, CurAST, &SafetyInfo); + TLI, L, CurAST, &SafetyInfo, ORE); // Once we have promoted values across the loop body we have to // recursively reform LCSSA as any nested loop may now have values defined @@ -320,7 +330,8 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA, /// bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop, - AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo) { + AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE) { // Verify inputs. 
assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr && @@ -336,7 +347,8 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, bool Changed = false; const std::vector<DomTreeNode *> &Children = N->getChildren(); for (DomTreeNode *Child : Children) - Changed |= sinkRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo); + Changed |= + sinkRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo, ORE); // Only need to process the contents of this block if it is not part of a // subloop (which would already have been processed). @@ -363,9 +375,9 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, // operands of the instruction are loop invariant. // if (isNotUsedInLoop(I, CurLoop, SafetyInfo) && - canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo)) { + canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE)) { ++II; - Changed |= sink(I, LI, DT, CurLoop, CurAST, SafetyInfo); + Changed |= sink(I, LI, DT, CurLoop, CurAST, SafetyInfo, ORE); } } return Changed; @@ -378,7 +390,8 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, /// bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop, - AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo) { + AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE) { // Verify inputs. assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr && CurLoop != nullptr && CurAST != nullptr && SafetyInfo != nullptr && @@ -417,16 +430,17 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, // is safe to hoist the instruction. // if (CurLoop->hasLoopInvariantOperands(&I) && - canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo) && + canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE) && isSafeToExecuteUnconditionally( - I, DT, CurLoop, SafetyInfo, + I, DT, CurLoop, SafetyInfo, ORE, CurLoop->getLoopPreheader()->getTerminator())) - Changed |= hoist(I, DT, CurLoop, SafetyInfo); + Changed |= hoist(I, DT, CurLoop, SafetyInfo, ORE); } const std::vector<DomTreeNode *> &Children = N->getChildren(); for (DomTreeNode *Child : Children) - Changed |= hoistRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo); + Changed |= + hoistRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo, ORE); return Changed; } @@ -465,7 +479,8 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) { bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, Loop *CurLoop, AliasSetTracker *CurAST, - LoopSafetyInfo *SafetyInfo) { + LoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE) { // Loads have extra constraints we have to verify before we can hoist them. if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { if (!LI->isUnordered()) @@ -486,7 +501,17 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, AAMDNodes AAInfo; LI->getAAMetadata(AAInfo); - return !pointerInvalidatedByLoop(LI->getOperand(0), Size, AAInfo, CurAST); + bool Invalidated = + pointerInvalidatedByLoop(LI->getOperand(0), Size, AAInfo, CurAST); + // Check loop-invariant address because this may also be a sinkable load + // whose address is not necessarily loop-invariant. 
+ if (ORE && Invalidated && CurLoop->isLoopInvariant(LI->getPointerOperand())) + ORE->emit(OptimizationRemarkMissed( + DEBUG_TYPE, "LoadWithLoopInvariantAddressInvalidated", LI) + << "failed to move load with loop-invariant address " + "because the loop may invalidate its value"); + + return !Invalidated; } else if (CallInst *CI = dyn_cast<CallInst>(&I)) { // Don't sink or hoist dbg info; it's legal, but not useful. if (isa<DbgInfoIntrinsic>(I)) @@ -680,8 +705,11 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN, /// static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT, const Loop *CurLoop, AliasSetTracker *CurAST, - const LoopSafetyInfo *SafetyInfo) { + const LoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE) { DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n"); + ORE->emit(OptimizationRemark(DEBUG_TYPE, "InstSunk", &I) + << "sinking " << ore::NV("Inst", &I)); bool Changed = false; if (isa<LoadInst>(I)) ++NumMovedLoads; @@ -748,10 +776,13 @@ static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT, /// is safe to hoist, this instruction is called to do the dirty work. /// static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, - const LoopSafetyInfo *SafetyInfo) { + const LoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE) { auto *Preheader = CurLoop->getLoopPreheader(); DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I << "\n"); + ORE->emit(OptimizationRemark(DEBUG_TYPE, "Hoisted", &I) + << "hoisting " << ore::NV("Inst", &I)); // Metadata can be dependent on conditions we are hoisting above. // Conservatively strip all metadata on the instruction unless we were @@ -786,15 +817,28 @@ static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, /// Only sink or hoist an instruction if it is not a trapping instruction, /// or if the instruction is known not to trap when moved to the preheader. /// or if it is a trapping instruction and is guaranteed to execute. -static bool isSafeToExecuteUnconditionally(const Instruction &Inst, +static bool isSafeToExecuteUnconditionally(Instruction &Inst, const DominatorTree *DT, const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE, const Instruction *CtxI) { if (isSafeToSpeculativelyExecute(&Inst, CtxI, DT)) return true; - return isGuaranteedToExecute(Inst, DT, CurLoop, SafetyInfo); + bool GuaranteedToExecute = + isGuaranteedToExecute(Inst, DT, CurLoop, SafetyInfo); + + if (!GuaranteedToExecute) { + auto *LI = dyn_cast<LoadInst>(&Inst); + if (LI && CurLoop->isLoopInvariant(LI->getPointerOperand())) + ORE->emit(OptimizationRemarkMissed( + DEBUG_TYPE, "LoadWithLoopInvariantAddressCondExecuted", LI) + << "failed to hoist load with loop-invariant address " + "because load is conditionally executed"); + } + + return GuaranteedToExecute; } namespace { @@ -882,7 +926,8 @@ bool llvm::promoteLoopAccessesToScalars( AliasSet &AS, SmallVectorImpl<BasicBlock *> &ExitBlocks, SmallVectorImpl<Instruction *> &InsertPts, PredIteratorCache &PIC, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, - Loop *CurLoop, AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo) { + Loop *CurLoop, AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE) { // Verify inputs.
assert(LI != nullptr && DT != nullptr && CurLoop != nullptr && CurAST != nullptr && SafetyInfo != nullptr && @@ -982,14 +1027,14 @@ bool llvm::promoteLoopAccessesToScalars( // If there is an non-load/store instruction in the loop, we can't promote // it. - if (const LoadInst *Load = dyn_cast<LoadInst>(UI)) { + if (LoadInst *Load = dyn_cast<LoadInst>(UI)) { assert(!Load->isVolatile() && "AST broken"); if (!Load->isSimple()) return false; if (!DereferenceableInPH) DereferenceableInPH = isSafeToExecuteUnconditionally( - *Load, DT, CurLoop, SafetyInfo, Preheader->getTerminator()); + *Load, DT, CurLoop, SafetyInfo, ORE, Preheader->getTerminator()); } else if (const StoreInst *Store = dyn_cast<StoreInst>(UI)) { // Stores *of* the pointer are not interesting, only stores *to* the // pointer. @@ -1074,6 +1119,9 @@ bool llvm::promoteLoopAccessesToScalars( // Otherwise, this is safe to promote, lets do it! DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr << '\n'); + ORE->emit( + OptimizationRemark(DEBUG_TYPE, "PromoteLoopAccessesToScalar", LoopUses[0]) + << "Moving accesses to memory location out of the loop"); ++NumPromoted; // Grab a debug location for the inserted loads/stores; given that the diff --git a/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp b/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp new file mode 100644 index 000000000000..a64c99117d64 --- /dev/null +++ b/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp @@ -0,0 +1,25 @@ +//===- LoopAccessAnalysisPrinter.cpp - Loop Access Analysis Printer --------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" +using namespace llvm; + +#define DEBUG_TYPE "loop-accesses" + +PreservedAnalyses +LoopAccessInfoPrinterPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &) { + Function &F = *L.getHeader()->getParent(); + auto &LAI = AM.getResult<LoopAccessAnalysis>(L, AR); + OS << "Loop access info in function '" << F.getName() << "':\n"; + OS.indent(2) << L.getHeader()->getName() << ":\n"; + LAI.print(OS, 4); + return PreservedAnalyses::all(); +} diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp index 187e6e3073c7..cca75a365024 100644 --- a/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/lib/Transforms/Scalar/LoopDeletion.cpp @@ -19,9 +19,9 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/IR/Dominators.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; @@ -215,15 +215,10 @@ bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE, return Changed; } -PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM) { - auto &FAM = AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager(); - Function *F = L.getHeader()->getParent(); - - auto &DT = *FAM.getCachedResult<DominatorTreeAnalysis>(*F); - auto &SE = *FAM.getCachedResult<ScalarEvolutionAnalysis>(*F); - auto &LI = *FAM.getCachedResult<LoopAnalysis>(*F); - - bool Changed = runImpl(&L, DT, SE, LI); +PreservedAnalyses 
LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &) { + bool Changed = runImpl(&L, AR.DT, AR.SE, AR.LI); if (!Changed) return PreservedAnalyses::all(); diff --git a/lib/Transforms/Scalar/LoopDistribute.cpp b/lib/Transforms/Scalar/LoopDistribute.cpp index b2b2f72aa83d..19716b28ad66 100644 --- a/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/lib/Transforms/Scalar/LoopDistribute.cpp @@ -31,13 +31,13 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -946,10 +946,18 @@ PreservedAnalyses LoopDistributePass::run(Function &F, auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); + // We don't directly need these analyses but they're required for loop + // analyses so provide them below. + auto &AA = AM.getResult<AAManager>(F); + auto &AC = AM.getResult<AssumptionAnalysis>(F); + auto &TTI = AM.getResult<TargetIRAnalysis>(F); + auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); + auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); std::function<const LoopAccessInfo &(Loop &)> GetLAA = [&](Loop &L) -> const LoopAccessInfo & { - return LAM.getResult<LoopAccessAnalysis>(L); + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI}; + return LAM.getResult<LoopAccessAnalysis>(L, AR); }; bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, GetLAA); diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 2743574ecca6..5fec51c095d0 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -46,7 +46,6 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -61,6 +60,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -186,24 +186,12 @@ public: }; } // End anonymous namespace. -PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, - LoopAnalysisManager &AM) { - const auto &FAM = - AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager(); - Function *F = L.getHeader()->getParent(); - - // Use getCachedResult because Loop pass cannot trigger a function analysis. 
- auto *AA = FAM.getCachedResult<AAManager>(*F); - auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F); - auto *LI = FAM.getCachedResult<LoopAnalysis>(*F); - auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F); - auto *TLI = FAM.getCachedResult<TargetLibraryAnalysis>(*F); - const auto *TTI = FAM.getCachedResult<TargetIRAnalysis>(*F); +PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &) { const auto *DL = &L.getHeader()->getModule()->getDataLayout(); - assert((AA && DT && LI && SE && TLI && TTI && DL) && - "Analyses for Loop Idiom Recognition not available"); - LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL); + LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL); if (!LIR.runOnLoop(&L)) return PreservedAnalyses::all(); diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp index f6620ad1ade5..69102d10ff60 100644 --- a/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -18,7 +18,6 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/DataLayout.h" @@ -26,6 +25,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; @@ -183,20 +183,10 @@ public: }; } -PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, - LoopAnalysisManager &AM) { - const auto &FAM = - AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager(); - Function *F = L.getHeader()->getParent(); - - // Use getCachedResult because Loop pass cannot trigger a function analysis. - auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F); - auto *LI = FAM.getCachedResult<LoopAnalysis>(*F); - auto *AC = FAM.getCachedResult<AssumptionAnalysis>(*F); - const auto *TLI = FAM.getCachedResult<TargetLibraryAnalysis>(*F); - assert((LI && AC && TLI) && "Analyses for Loop Inst Simplify not available"); - - if (!SimplifyLoopInst(&L, DT, LI, AC, TLI)) +PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &) { + if (!SimplifyLoopInst(&L, &AR.DT, &AR.LI, &AR.AC, &AR.TLI)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); diff --git a/lib/Transforms/Scalar/LoopPassManager.cpp b/lib/Transforms/Scalar/LoopPassManager.cpp new file mode 100644 index 000000000000..028f4bba8b1d --- /dev/null +++ b/lib/Transforms/Scalar/LoopPassManager.cpp @@ -0,0 +1,85 @@ +//===- LoopPassManager.cpp - Loop pass management -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Analysis/LoopInfo.h" + +using namespace llvm; + +// Explicit template instantiations and specialization definitions for core +// template typedefs.
+namespace llvm { +template class PassManager<Loop, LoopAnalysisManager, + LoopStandardAnalysisResults &, LPMUpdater &>; + +/// Explicitly specialize the pass manager's run method to handle loop nest +/// structure updates. +template <> +PreservedAnalyses +PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &, + LPMUpdater &>::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U) { + PreservedAnalyses PA = PreservedAnalyses::all(); + + if (DebugLogging) + dbgs() << "Starting Loop pass manager run.\n"; + + for (auto &Pass : Passes) { + if (DebugLogging) + dbgs() << "Running pass: " << Pass->name() << " on " << L; + + PreservedAnalyses PassPA = Pass->run(L, AM, AR, U); + + // If the loop was deleted, abort the run and return to the outer walk. + if (U.skipCurrentLoop()) { + PA.intersect(std::move(PassPA)); + break; + } + + // Update the analysis manager as each pass runs and potentially + // invalidates analyses. + AM.invalidate(L, PassPA); + + // Finally, we intersect the final preserved analyses to compute the + // aggregate preserved set for this pass manager. + PA.intersect(std::move(PassPA)); + + // FIXME: Historically, the pass managers all called the LLVM context's + // yield function here. We don't have a generic way to acquire the + // context and it isn't yet clear what the right pattern is for yielding + // in the new pass manager so it is currently omitted. + // ...getContext().yield(); + } + + // Invalidation for the current loop should be handled above, and other loop + // analysis results shouldn't be impacted by runs over this loop. Therefore, + // the remaining analysis results in the AnalysisManager are preserved. We + // mark this with a set so that we don't need to inspect each one + // individually. + // FIXME: This isn't correct! This loop and all nested loops' analyses should + // be preserved, but unrolling should invalidate the parent loop's analyses. 
+ PA.preserveSet<AllAnalysesOn<Loop>>(); + + if (DebugLogging) + dbgs() << "Finished Loop pass manager run.\n"; + + return PA; +} +} + +PrintLoopPass::PrintLoopPass() : OS(dbgs()) {} +PrintLoopPass::PrintLoopPass(raw_ostream &OS, const std::string &Banner) + : OS(OS), Banner(Banner) {} + +PreservedAnalyses PrintLoopPass::run(Loop &L, LoopAnalysisManager &, + LoopStandardAnalysisResults &, + LPMUpdater &) { + printLoop(L, OS, Banner); + return PreservedAnalyses::all(); +} diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index 0225cc325700..cc83069d5f52 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -14,13 +14,12 @@ #include "llvm/Transforms/Scalar/LoopRotation.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -34,6 +33,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -625,20 +625,11 @@ bool LoopRotate::processLoop(Loop *L) { LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication) : EnableHeaderDuplication(EnableHeaderDuplication) {} -PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM) { - auto &FAM = AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager(); - Function *F = L.getHeader()->getParent(); - - auto *LI = FAM.getCachedResult<LoopAnalysis>(*F); - const auto *TTI = FAM.getCachedResult<TargetIRAnalysis>(*F); - auto *AC = FAM.getCachedResult<AssumptionAnalysis>(*F); - assert((LI && TTI && AC) && "Analyses for loop rotation not available"); - - // Optional analyses. - auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F); - auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F); +PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &) { int Threshold = EnableHeaderDuplication ? 
DefaultRotationThreshold : 0; - LoopRotate LR(Threshold, LI, TTI, AC, DT, SE); + LoopRotate LR(Threshold, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE); bool Changed = LR.processLoop(&L); if (!Changed) diff --git a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index d37339fc5fee..16061212ba38 100644 --- a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -18,18 +18,18 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; @@ -64,16 +64,10 @@ static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI) { return Changed; } -PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM) { - const auto &FAM = - AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager(); - Function *F = L.getHeader()->getParent(); - - auto *LI = FAM.getCachedResult<LoopAnalysis>(*F); - auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F); - assert((LI && DT) && "Analyses for LoopSimplifyCFG not available"); - - if (!simplifyLoopCFG(L, *DT, *LI)) +PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &) { + if (!simplifyLoopCFG(L, AR.DT, AR.LI)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); } diff --git a/lib/Transforms/Scalar/LoopSink.cpp b/lib/Transforms/Scalar/LoopSink.cpp index f64354497771..f3f415275c0e 100644 --- a/lib/Transforms/Scalar/LoopSink.cpp +++ b/lib/Transforms/Scalar/LoopSink.cpp @@ -38,7 +38,6 @@ #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/IR/Dominators.h" @@ -47,6 +46,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; @@ -283,6 +283,9 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI, // sinked. for (auto II = Preheader->rbegin(), E = Preheader->rend(); II != E;) { Instruction *I = &*II++; + // No need to check for instruction's operands are loop invariant. 
+ assert(L.hasLoopInvariantOperands(I) && + "Insts in a loop's preheader should have loop invariant operands!"); if (!canSinkOrHoistInst(*I, &AA, &DT, &L, &CurAST, nullptr)) continue; if (sinkInstruction(L, *I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI)) diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index a61f646042ae..a1561fc0a6c2 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -59,16 +59,15 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/PointerIntPair.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/IVUsers.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -80,13 +79,13 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" #include "llvm/IR/OperandTraits.h" #include "llvm/IR/Operator.h" -#include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" @@ -99,6 +98,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> @@ -5052,21 +5052,11 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { return ReduceLoopStrength(L, IU, SE, DT, LI, TTI); } -PreservedAnalyses LoopStrengthReducePass::run(Loop &L, - LoopAnalysisManager &AM) { - const auto &FAM = - AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager(); - Function *F = L.getHeader()->getParent(); - - auto &IU = AM.getResult<IVUsersAnalysis>(L); - auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F); - auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F); - auto *LI = FAM.getCachedResult<LoopAnalysis>(*F); - auto *TTI = FAM.getCachedResult<TargetIRAnalysis>(*F); - assert((SE && DT && LI && TTI) && - "Analyses for Loop Strength Reduce not available"); - - if (!ReduceLoopStrength(&L, IU, *SE, *DT, *LI, *TTI)) +PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &) { + if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE, + AR.DT, AR.LI, AR.TTI)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index f66369b30369..c7f91226d222 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -19,7 +19,6 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/LoopPassManager.h" #include "llvm/Analysis/LoopUnrollAnalyzer.h" #include 
"llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -33,6 +32,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" #include <climits> @@ -1111,41 +1111,23 @@ Pass *llvm::createSimpleLoopUnrollPass() { return llvm::createLoopUnrollPass(-1, -1, 0, 0, 0); } -PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM) { +PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &) { const auto &FAM = - AM.getResult<FunctionAnalysisManagerLoopProxy>(L).getManager(); + AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager(); Function *F = L.getHeader()->getParent(); - - DominatorTree *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F); - LoopInfo *LI = FAM.getCachedResult<LoopAnalysis>(*F); - ScalarEvolution *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F); - auto *TTI = FAM.getCachedResult<TargetIRAnalysis>(*F); - auto *AC = FAM.getCachedResult<AssumptionAnalysis>(*F); auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F); - if (!DT) - report_fatal_error( - "LoopUnrollPass: DominatorTreeAnalysis not cached at a higher level"); - if (!LI) - report_fatal_error( - "LoopUnrollPass: LoopAnalysis not cached at a higher level"); - if (!SE) - report_fatal_error( - "LoopUnrollPass: ScalarEvolutionAnalysis not cached at a higher level"); - if (!TTI) - report_fatal_error( - "LoopUnrollPass: TargetIRAnalysis not cached at a higher level"); - if (!AC) - report_fatal_error( - "LoopUnrollPass: AssumptionAnalysis not cached at a higher level"); + // FIXME: This should probably be optional rather than required. if (!ORE) report_fatal_error("LoopUnrollPass: OptimizationRemarkEmitterAnalysis not " "cached at a higher level"); - bool Changed = - tryToUnrollLoop(&L, *DT, LI, SE, *TTI, *AC, *ORE, /*PreserveLCSSA*/ true, - ProvidedCount, ProvidedThreshold, ProvidedAllowPartial, - ProvidedRuntime, ProvidedUpperBound); + bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, &AR.SE, AR.TTI, AR.AC, *ORE, + /*PreserveLCSSA*/ true, ProvidedCount, + ProvidedThreshold, ProvidedAllowPartial, + ProvidedRuntime, ProvidedUpperBound); if (!Changed) return PreservedAnalyses::all(); diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp index eef7db08cd46..e1b6741f31b4 100644 --- a/lib/Transforms/Scalar/NewGVN.cpp +++ b/lib/Transforms/Scalar/NewGVN.cpp @@ -135,6 +135,10 @@ struct CongruenceClass { // purposes, and for skipping empty classes. bool Dead = false; + // Number of stores in this congruence class. + // This is used so we can detect store equivalence changes properly. + int StoreCount = 0; + explicit CongruenceClass(unsigned ID) : ID(ID) {} CongruenceClass(unsigned ID, Value *Leader, const Expression *E) : ID(ID), RepLeader(Leader), DefiningExpr(E) {} @@ -198,7 +202,7 @@ class NewGVN : public FunctionPass { ExpressionClassMap ExpressionToClass; // Which values have changed as a result of leader changes. - SmallPtrSet<Value *, 8> ChangedValues; + SmallPtrSet<Value *, 8> LeaderChanges; // Reachability info. 
using BlockEdge = BasicBlockEdge; @@ -317,7 +321,8 @@ private: template <class T> Value *lookupOperandLeader(Value *, const User *, const T &) const; void performCongruenceFinding(Value *, const Expression *); - + void moveValueToNewCongruenceClass(Value *, CongruenceClass *, + CongruenceClass *); // Reachability handling. void updateReachableEdge(BasicBlock *, BasicBlock *); void processOutgoingEdges(TerminatorInst *, BasicBlock *); @@ -347,7 +352,8 @@ private: void cleanupTables(); std::pair<unsigned, unsigned> assignDFSNumbers(BasicBlock *, unsigned); void updateProcessedCount(Value *V); - void verifyMemoryCongruency(); + void verifyMemoryCongruency() const; + bool singleReachablePHIPath(const MemoryAccess *, const MemoryAccess *) const; }; char NewGVN::ID = 0; @@ -717,10 +723,10 @@ const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI, // Utility function to check whether the congruence class has a member other // than the given instruction. bool hasMemberOtherThanUs(const CongruenceClass *CC, Instruction *I) { - // Either it has more than one member, in which case it must contain something - // other than us (because it's indexed by value), or if it only has one member + // Either it has more than one store, in which case it must contain something + // other than us (because it's indexed by value), or if it only has one store // right now, that member should not be us. - return CC->Members.size() > 1 || CC->Members.count(I) == 0; + return CC->StoreCount > 1 || CC->Members.count(I) == 0; } const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I, @@ -1044,7 +1050,40 @@ void NewGVN::markLeaderChangeTouched(CongruenceClass *CC) { for (auto M : CC->Members) { if (auto *I = dyn_cast<Instruction>(M)) TouchedInstructions.set(InstrDFS[I]); - ChangedValues.insert(M); + LeaderChanges.insert(M); + } +} + +// Move a value, currently in OldClass, to be part of NewClass +// Update OldClass for the move (including changing leaders, etc) +void NewGVN::moveValueToNewCongruenceClass(Value *V, CongruenceClass *OldClass, + CongruenceClass *NewClass) { + DEBUG(dbgs() << "New congruence class for " << V << " is " << NewClass->ID + << "\n"); + OldClass->Members.erase(V); + NewClass->Members.insert(V); + if (isa<StoreInst>(V)) { + --OldClass->StoreCount; + assert(OldClass->StoreCount >= 0); + ++NewClass->StoreCount; + assert(NewClass->StoreCount > 0); + } + + ValueToClass[V] = NewClass; + // See if we destroyed the class or need to swap leaders. + if (OldClass->Members.empty() && OldClass != InitialClass) { + if (OldClass->DefiningExpr) { + OldClass->Dead = true; + DEBUG(dbgs() << "Erasing expression " << OldClass->DefiningExpr + << " from table\n"); + ExpressionToClass.erase(OldClass->DefiningExpr); + } + } else if (OldClass->RepLeader == V) { + // When the leader changes, the value numbering of + // everything may change due to symbolization changes, so we need to + // reprocess. 
+ OldClass->RepLeader = *(OldClass->Members.begin()); + markLeaderChangeTouched(OldClass); } } @@ -1101,33 +1140,16 @@ void NewGVN::performCongruenceFinding(Value *V, const Expression *E) { assert(!EClass->Dead && "We accidentally looked up a dead class"); } } - bool WasInChanged = ChangedValues.erase(V); - if (VClass != EClass || WasInChanged) { + bool ClassChanged = VClass != EClass; + bool LeaderChanged = LeaderChanges.erase(V); + if (ClassChanged || LeaderChanged) { DEBUG(dbgs() << "Found class " << EClass->ID << " for expression " << E << "\n"); - if (VClass != EClass) { - DEBUG(dbgs() << "New congruence class for " << V << " is " << EClass->ID - << "\n"); - - VClass->Members.erase(V); - EClass->Members.insert(V); - ValueToClass[V] = EClass; - // See if we destroyed the class or need to swap leaders. - if (VClass->Members.empty() && VClass != InitialClass) { - if (VClass->DefiningExpr) { - VClass->Dead = true; - DEBUG(dbgs() << "Erasing expression " << *E << " from table\n"); - ExpressionToClass.erase(VClass->DefiningExpr); - } - } else if (VClass->RepLeader == V) { - // When the leader changes, the value numbering of - // everything may change due to symbolization changes, so we need to - // reprocess. - VClass->RepLeader = *(VClass->Members.begin()); - markLeaderChangeTouched(VClass); - } - } + if (ClassChanged) + + moveValueToNewCongruenceClass(V, VClass, EClass); + markUsersTouched(V); if (auto *I = dyn_cast<Instruction>(V)) { @@ -1315,9 +1337,12 @@ void NewGVN::initializeCongruenceClasses(Function &F) { // MemoryDef's for stores and all MemoryPhis to be equal. Right now, no // other expression can generate a memory equivalence. If we start // handling memcpy/etc, we can expand this. - if (isa<StoreInst>(&I)) + if (isa<StoreInst>(&I)) { MemoryAccessEquiv.insert( {MSSA->getMemoryAccess(&I), MSSA->getLiveOnEntryDef()}); + ++InitialClass->StoreCount; + assert(InitialClass->StoreCount > 0); + } } } InitialClass->Members.swap(InitialValues); @@ -1454,9 +1479,40 @@ void NewGVN::valueNumberInstruction(Instruction *I) { } } +// Check if there is a path, using single or equal argument phi nodes, from +// First to Second. +bool NewGVN::singleReachablePHIPath(const MemoryAccess *First, + const MemoryAccess *Second) const { + if (First == Second) + return true; + + if (auto *FirstDef = dyn_cast<MemoryUseOrDef>(First)) { + auto *DefAccess = FirstDef->getDefiningAccess(); + return singleReachablePHIPath(DefAccess, Second); + } else { + auto *MP = cast<MemoryPhi>(First); + auto ReachableOperandPred = [&](const Use &U) { + return ReachableBlocks.count(MP->getIncomingBlock(U)); + }; + auto FilteredPhiArgs = + make_filter_range(MP->operands(), ReachableOperandPred); + SmallVector<const Value *, 32> OperandList; + std::copy(FilteredPhiArgs.begin(), FilteredPhiArgs.end(), + std::back_inserter(OperandList)); + bool Okay = OperandList.size() == 1; + if (!Okay) + Okay = std::equal(OperandList.begin(), OperandList.end(), + OperandList.begin()); + if (Okay) + return singleReachablePHIPath(cast<MemoryAccess>(OperandList[0]), Second); + return false; + } +} + // Verify the that the memory equivalence table makes sense relative to the -// congruence classes. -void NewGVN::verifyMemoryCongruency() { +// congruence classes. Note that this checking is not perfect, and is currently +// subject to very rare false negatives. It is only useful for testing/debugging. +void NewGVN::verifyMemoryCongruency() const { // Anything equivalent in the memory access table should be in the same // congruence class. 
@@ -1483,11 +1539,12 @@ void NewGVN::verifyMemoryCongruency() { if (auto *FirstMUD = dyn_cast<MemoryUseOrDef>(KV.first)) { auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second); if (FirstMUD && SecondMUD) - assert( - ValueToClass.lookup(FirstMUD->getMemoryInst()) == - ValueToClass.lookup(SecondMUD->getMemoryInst()) && - "The instructions for these memory operations should have been in " - "the same congruence class"); + assert((singleReachablePHIPath(FirstMUD, SecondMUD) || + ValueToClass.lookup(FirstMUD->getMemoryInst()) == + ValueToClass.lookup(SecondMUD->getMemoryInst())) && + "The instructions for these memory operations should have " + "been in the same congruence class or reachable through" + "a single argument phi"); } else if (auto *FirstMP = dyn_cast<MemoryPhi>(KV.first)) { // We can only sanely verify that MemoryDefs in the operand list all have diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp index fa2235e8439a..49ce0262c97b 100644 --- a/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -792,6 +792,7 @@ void StructurizeCFG::handleLoops(bool ExitUseAllowed, LoopFunc, LoopStart); BranchInst::Create(LoopStart, NewEntry); + DT->setNewRoot(NewEntry); } // Create an extra loop end node diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index e551e4b47ac1..f9a602bc268a 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -172,6 +172,36 @@ static bool needToInsertPhisForLCSSA(Loop *L, std::vector<BasicBlock *> Blocks, return false; } +/// Adds ClonedBB to LoopInfo, creates a new loop for ClonedBB if necessary +/// and adds a mapping from the original loop to the new loop to NewLoops. +/// Returns nullptr if no new loop was created and a pointer to the +/// original loop OriginalBB was part of otherwise. +const Loop* llvm::addClonedBlockToLoopInfo(BasicBlock *OriginalBB, + BasicBlock *ClonedBB, LoopInfo *LI, + NewLoopsMap &NewLoops) { + // Figure out which loop New is in. + const Loop *OldLoop = LI->getLoopFor(OriginalBB); + assert(OldLoop && "Should (at least) be in the loop being unrolled!"); + + Loop *&NewLoop = NewLoops[OldLoop]; + if (!NewLoop) { + // Found a new sub-loop. + assert(OriginalBB == OldLoop->getHeader() && + "Header should be first in RPO"); + + Loop *NewLoopParent = NewLoops.lookup(OldLoop->getParentLoop()); + assert(NewLoopParent && + "Expected parent loop before sub-loop in RPO"); + NewLoop = new Loop; + NewLoopParent->addChildLoop(NewLoop); + NewLoop->addBasicBlockToLoop(ClonedBB, *LI); + return OldLoop; + } else { + NewLoop->addBasicBlockToLoop(ClonedBB, *LI); + return nullptr; + } +} + /// Unroll the given loop by Count. The loop must be in LCSSA form. Returns true /// if unrolling was successful, or false if the loop was unmodified. Unrolling /// can only fail when the loop's latch block is not terminated by a conditional @@ -428,28 +458,14 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force, assert(LI->getLoopFor(*BB) == L && "Header should not be in a sub-loop"); L->addBasicBlockToLoop(New, *LI); } else { - // Figure out which loop New is in. - const Loop *OldLoop = LI->getLoopFor(*BB); - assert(OldLoop && "Should (at least) be in the loop being unrolled!"); - - Loop *&NewLoop = NewLoops[OldLoop]; - if (!NewLoop) { - // Found a new sub-loop. 
- assert(*BB == OldLoop->getHeader() && - "Header should be first in RPO"); - - Loop *NewLoopParent = NewLoops.lookup(OldLoop->getParentLoop()); - assert(NewLoopParent && - "Expected parent loop before sub-loop in RPO"); - NewLoop = new Loop; - NewLoopParent->addChildLoop(NewLoop); - LoopsToSimplify.insert(NewLoop); + const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops); + if (OldLoop) { + LoopsToSimplify.insert(NewLoops[OldLoop]); // Forget the old loop, since its inputs may have changed. if (SE) SE->forgetLoop(OldLoop); } - NewLoop->addBasicBlockToLoop(New, *LI); } if (*BB == Header) diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 5758a415f12b..85da3ba899a5 100644 --- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -301,15 +301,17 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, LI->addTopLevelLoop(NewLoop); } + NewLoopsMap NewLoops; + NewLoops[L] = NewLoop; // For each block in the original loop, create a new copy, // and update the value map with the newly created values. for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) { BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F); NewBlocks.push_back(NewBB); - if (NewLoop) - NewLoop->addBasicBlockToLoop(NewBB, *LI); - else if (ParentLoop) + if (NewLoop) { + addClonedBlockToLoopInfo(*BB, NewBB, LI, NewLoops); + } else if (ParentLoop) ParentLoop->addBasicBlockToLoop(NewBB, *LI); VMap[*BB] = NewBB; diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp index 09e9f1ddc7fe..c8efa9efc7f3 100644 --- a/lib/Transforms/Utils/LoopUtils.cpp +++ b/lib/Transforms/Utils/LoopUtils.cpp @@ -869,8 +869,13 @@ bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop, return false; } - assert(TheLoop->getHeader() == Phi->getParent() && - "PHI is an AddRec for a different loop?!"); + if (AR->getLoop() != TheLoop) { + // FIXME: We should treat this as a uniform. Unfortunately, we + // don't currently know how to handled uniform PHIs. + DEBUG(dbgs() << "LV: PHI is a recurrence with respect to an outer loop.\n"); + return false; + } + Value *StartValue = Phi->getIncomingValueForBlock(AR->getLoop()->getLoopPreheader()); const SCEV *Step = AR->getStepRecurrence(*SE); diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 54390e77bb1f..6e30919246c7 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -1275,10 +1275,9 @@ static bool HoistThenElseCodeToIf(BranchInst *BI, LLVMContext::MD_mem_parallel_loop_access}; combineMetadata(I1, I2, KnownIDs); - // If the debug loc for I1 and I2 are different, as we are combining them - // into one instruction, we do not want to select debug loc randomly from - // I1 or I2. - if (!isa<CallInst>(I1) && I1->getDebugLoc() != I2->getDebugLoc()) + // I1 and I2 are being combined into a single instruction. Its debug + // location is the merged locations of the original instructions. + if (!isa<CallInst>(I1)) I1->setDebugLoc( DILocation::getMergedLocation(I1->getDebugLoc(), I2->getDebugLoc())); @@ -1577,7 +1576,7 @@ static bool sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) { // The debug location for the "common" instruction is the merged locations of // all the commoned instructions. We start with the original location of the // "common" instruction and iteratively merge each location in the loop below. 
- DILocation *Loc = I0->getDebugLoc(); + const DILocation *Loc = I0->getDebugLoc(); // Update metadata and IR flags, and merge debug locations. for (auto *I : Insts) diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index 11d54bcf4f89..8eaeb1073a76 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -1074,6 +1074,24 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { if (Op2C->getValueAPF().isZero()) // pow(x, 0.0) -> 1.0 return ConstantFP::get(CI->getType(), 1.0); + if (Op2C->isExactlyValue(-0.5) && + hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::sqrt, LibFunc::sqrtf, + LibFunc::sqrtl)) { + // If -ffast-math: + // pow(x, -0.5) -> 1.0 / sqrt(x) + if (CI->hasUnsafeAlgebra()) { + IRBuilder<>::FastMathFlagGuard Guard(B); + B.setFastMathFlags(CI->getFastMathFlags()); + + // Here we cannot lower to an intrinsic because C99 sqrt() and llvm.sqrt + // are not guaranteed to have the same semantics. + Value *Sqrt = emitUnaryFloatFnCall(Op1, TLI->getName(LibFunc::sqrt), B, + Callee->getAttributes()); + + return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Sqrt, "sqrtrecip"); + } + } + if (Op2C->isExactlyValue(0.5) && hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::sqrt, LibFunc::sqrtf, LibFunc::sqrtl) && @@ -1121,6 +1139,10 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { !V.isInteger()) return nullptr; + // Propagate fast math flags. + IRBuilder<>::FastMathFlagGuard Guard(B); + B.setFastMathFlags(CI->getFastMathFlags()); + // We will memoize intermediate products of the Addition Chain. Value *InnerChain[33] = {nullptr}; InnerChain[1] = Op1; @@ -1131,7 +1153,6 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) { bool ignored; V.convert(APFloat::IEEEdouble(), APFloat::rmTowardZero, &ignored); - // TODO: Should the new instructions propagate the 'fast' flag of the pow()? Value *FMul = getPow(InnerChain, V.convertToDouble(), B); // For negative exponents simply compute the reciprocal. 
if (Op2C->isNegative()) diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 578c65daf7c0..1b1f86f8efdc 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -80,6 +80,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" +#include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" @@ -6949,9 +6950,9 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } else if (Legal->isUniform(Op2)) { Op2VK = TargetTransformInfo::OK_UniformValue; } - - return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK, - Op1VP, Op2VP); + SmallVector<const Value *, 4> Operands(I->operand_values()); + return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, + Op2VK, Op1VP, Op2VP, Operands); } case Instruction::Select: { SelectInst *SI = cast<SelectInst>(I); @@ -7641,7 +7642,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, auto &TTI = AM.getResult<TargetIRAnalysis>(F); auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); - auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F); + auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); auto &AA = AM.getResult<AAManager>(F); auto &AC = AM.getResult<AssumptionAnalysis>(F); auto &DB = AM.getResult<DemandedBitsAnalysis>(F); @@ -7650,10 +7651,11 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); std::function<const LoopAccessInfo &(Loop &)> GetLAA = [&](Loop &L) -> const LoopAccessInfo & { - return LAM.getResult<LoopAccessAnalysis>(L); + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI}; + return LAM.getResult<LoopAccessAnalysis>(L, AR); }; bool Changed = - runImpl(F, SE, LI, TTI, DT, BFI, TLI, DB, AA, AC, GetLAA, ORE); + runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE); if (!Changed) return PreservedAnalyses::all(); PreservedAnalyses PA; diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index bcaa8439cffa..1c7cbc7edf9a 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2493,10 +2493,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *LHS = vectorizeTree(LHSVL); Value *RHS = vectorizeTree(RHSVL); - if (LHS == RHS && isa<Instruction>(LHS)) { - assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order"); - } - if (Value *V = alreadyVectorized(E->Scalars)) return V; diff --git a/lib/XRay/CMakeLists.txt b/lib/XRay/CMakeLists.txt new file mode 100644 index 000000000000..6c1acba79bfa --- /dev/null +++ b/lib/XRay/CMakeLists.txt @@ -0,0 +1,13 @@ +add_llvm_library(LLVMXRay + Trace.cpp + + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/ADT + ${LLVM_MAIN_INCLUDE_DIR}/llvm/XRay + + DEPENDS + LLVMSupport + + LINK_LIBS + LLVMSupport + ) diff --git a/lib/XRay/Trace.cpp b/lib/XRay/Trace.cpp new file mode 100644 index 000000000000..51000c777de8 --- /dev/null +++ b/lib/XRay/Trace.cpp @@ -0,0 +1,196 @@ +//===- Trace.cpp - XRay Trace Loading implementation. ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// XRay log reader implementation. +// +//===----------------------------------------------------------------------===// +#include "llvm/XRay/Trace.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/DataExtractor.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/XRay/YAMLXRayRecord.h" + +using namespace llvm; +using namespace llvm::xray; +using llvm::yaml::Input; + +using XRayRecordStorage = + std::aligned_storage<sizeof(XRayRecord), alignof(XRayRecord)>::type; + +Error NaiveLogLoader(StringRef Data, XRayFileHeader &FileHeader, + std::vector<XRayRecord> &Records) { + // FIXME: Maybe deduce whether the data is little or big-endian using some + // magic bytes in the beginning of the file? + + // First 32 bytes of the file will always be the header. We assume a certain + // format here: + // + // (2) uint16 : version + // (2) uint16 : type + // (4) uint32 : bitfield + // (8) uint64 : cycle frequency + // (16) - : padding + // + if (Data.size() < 32) + return make_error<StringError>( + "Not enough bytes for an XRay log.", + std::make_error_code(std::errc::invalid_argument)); + + if (Data.size() - 32 == 0 || Data.size() % 32 != 0) + return make_error<StringError>( + "Invalid-sized XRay data.", + std::make_error_code(std::errc::invalid_argument)); + + DataExtractor HeaderExtractor(Data, true, 8); + uint32_t OffsetPtr = 0; + FileHeader.Version = HeaderExtractor.getU16(&OffsetPtr); + FileHeader.Type = HeaderExtractor.getU16(&OffsetPtr); + uint32_t Bitfield = HeaderExtractor.getU32(&OffsetPtr); + FileHeader.ConstantTSC = Bitfield & 1uL; + FileHeader.NonstopTSC = Bitfield & 1uL << 1; + FileHeader.CycleFrequency = HeaderExtractor.getU64(&OffsetPtr); + + if (FileHeader.Version != 1) + return make_error<StringError>( + Twine("Unsupported XRay file version: ") + Twine(FileHeader.Version), + std::make_error_code(std::errc::invalid_argument)); + + // Each record after the header will be 32 bytes, in the following format: + // + // (2) uint16 : record type + // (1) uint8 : cpu id + // (1) uint8 : type + // (4) sint32 : function id + // (8) uint64 : tsc + // (4) uint32 : thread id + // (12) - : padding + for (auto S = Data.drop_front(32); !S.empty(); S = S.drop_front(32)) { + DataExtractor RecordExtractor(S, true, 8); + uint32_t OffsetPtr = 0; + Records.emplace_back(); + auto &Record = Records.back(); + Record.RecordType = RecordExtractor.getU16(&OffsetPtr); + Record.CPU = RecordExtractor.getU8(&OffsetPtr); + auto Type = RecordExtractor.getU8(&OffsetPtr); + switch (Type) { + case 0: + Record.Type = RecordTypes::ENTER; + break; + case 1: + Record.Type = RecordTypes::EXIT; + break; + default: + return make_error<StringError>( + Twine("Unknown record type '") + Twine(int{Type}) + "'", + std::make_error_code(std::errc::executable_format_error)); + } + Record.FuncId = RecordExtractor.getSigned(&OffsetPtr, sizeof(int32_t)); + Record.TSC = RecordExtractor.getU64(&OffsetPtr); + Record.TId = RecordExtractor.getU32(&OffsetPtr); + } + return Error::success(); +} + +Error YAMLLogLoader(StringRef Data, XRayFileHeader &FileHeader, + std::vector<XRayRecord> &Records) { + + // Load the documents from the MappedFile. 
+  YAMLXRayTrace Trace; +  Input In(Data); +  In >> Trace; +  if (In.error()) +    return make_error<StringError>("Failed loading YAML Data.", In.error()); + +  FileHeader.Version = Trace.Header.Version; +  FileHeader.Type = Trace.Header.Type; +  FileHeader.ConstantTSC = Trace.Header.ConstantTSC; +  FileHeader.NonstopTSC = Trace.Header.NonstopTSC; +  FileHeader.CycleFrequency = Trace.Header.CycleFrequency; + +  if (FileHeader.Version != 1) +    return make_error<StringError>( +        Twine("Unsupported XRay file version: ") + Twine(FileHeader.Version), +        std::make_error_code(std::errc::invalid_argument)); + +  Records.clear(); +  std::transform(Trace.Records.begin(), Trace.Records.end(), +                 std::back_inserter(Records), [&](const YAMLXRayRecord &R) { +                   return XRayRecord{R.RecordType, R.CPU, R.Type, +                                     R.FuncId, R.TSC, R.TId}; +                 }); +  return Error::success(); +} + +Expected<Trace> llvm::xray::loadTraceFile(StringRef Filename, bool Sort) { +  int Fd; +  if (auto EC = sys::fs::openFileForRead(Filename, Fd)) { +    return make_error<StringError>( +        Twine("Cannot read log from '") + Filename + "'", EC); +  } + +  // Attempt to get the filesize. +  uint64_t FileSize; +  if (auto EC = sys::fs::file_size(Filename, FileSize)) { +    return make_error<StringError>( +        Twine("Cannot read log from '") + Filename + "'", EC); +  } +  if (FileSize < 4) { +    return make_error<StringError>( +        Twine("File '") + Filename + "' too small for XRay.", +        std::make_error_code(std::errc::executable_format_error)); +  } + +  // Attempt to mmap the file. +  std::error_code EC; +  sys::fs::mapped_file_region MappedFile( +      Fd, sys::fs::mapped_file_region::mapmode::readonly, FileSize, 0, EC); +  if (EC) { +    return make_error<StringError>( +        Twine("Cannot read log from '") + Filename + "'", EC); +  } + +  // Attempt to detect the file type using file magic. We have a slight bias +  // towards the binary format, and we do this by making sure that the first 4 +  // bytes of the binary file is some combination of the following byte +  // patterns: +  // +  // 0x0001 0x0000 - version 1, "naive" format +  // 0x0001 0x0001 - version 1, "flight data recorder" format +  // +  // YAML files don't typically have those first four bytes as valid text so we +  // try loading assuming YAML if we don't find these bytes. +  // +  // Only if we can't load either the binary or the YAML format will we yield an +  // error. +  StringRef Magic(MappedFile.data(), 4); +  DataExtractor HeaderExtractor(Magic, true, 8); +  uint32_t OffsetPtr = 0; +  uint16_t Version = HeaderExtractor.getU16(&OffsetPtr); +  uint16_t Type = HeaderExtractor.getU16(&OffsetPtr); + +  Trace T; +  if (Version == 1 && (Type == 0 || Type == 1)) { +    if (auto E = NaiveLogLoader(StringRef(MappedFile.data(), MappedFile.size()), +                                T.FileHeader, T.Records)) +      return std::move(E); +  } else { +    if (auto E = YAMLLogLoader(StringRef(MappedFile.data(), MappedFile.size()), +                               T.FileHeader, T.Records)) +      return std::move(E); +  } + +  if (Sort) +    std::sort(T.Records.begin(), T.Records.end(), +              [&](const XRayRecord &L, const XRayRecord &R) { +                return L.TSC < R.TSC; +              }); + +  return std::move(T); +}
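The loadTraceFile entry point added above returns Expected<Trace>, so a caller must consume the Error on failure. A minimal sketch of a hypothetical stand-alone driver follows; it is not part of this diff and only exercises the loading and error-handling path shown here, since the accessors for walking a Trace's records are declared in llvm/XRay/Trace.h, which this diff does not show.

#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/XRay/Trace.h"

using namespace llvm;

// Hypothetical driver: load an XRay log with the new API and report the result.
int main(int argc, char **argv) {
  if (argc < 2) {
    errs() << "usage: " << argv[0] << " <xray-log>\n";
    return 1;
  }
  // Sort=true asks loadTraceFile to order the records by TSC, as in the
  // std::sort call in the implementation above.
  Expected<xray::Trace> TraceOrErr = xray::loadTraceFile(argv[1], /*Sort=*/true);
  if (!TraceOrErr) {
    // Consume the Error; an unchecked llvm::Error asserts in its destructor.
    errs() << "failed to load '" << argv[1]
           << "': " << toString(TraceOrErr.takeError()) << "\n";
    return 1;
  }
  outs() << "loaded XRay trace from '" << argv[1] << "'\n";
  return 0;
}

Such a tool would link against the new LLVMXRay library (plus LLVMSupport), matching the CMakeLists.txt added above.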